Supergene Mediated Differential Expression Analysis

This is an R markdown document illustrating the bioinformatic methods used to generate the results for the manuscript by Arsenault et al. entitled "Simple inheritance, complex regulation: supergene-mediated fire ant queen polymorphism". If you have any questions about this document, please direct them to Sam Arsenault at samarsenault93@gmail.com or Brendan G. Hunt at brendan.hunt@gmail.com

Load in Relevant Libraries

library(edgeR)
library(UpSetR)
library(readr)
library(magick)
library(gridExtra)
library(karyoploteR)
library(cowplot)
library(ggplot2)
library(topGO)
library(data.table)
library(eulerr)
library(pheatmap)
library(CircStats)
library(RColorBrewer)
library(randtests)
library(readxl)
library(rtracklayer)
library(rentrez)
library(dplyr)

Load in Relevant Custom Functions

make_polar_DEG <- function(TT_table,radial_lim) {
  ## Things to Add: 
  library(ggplot2)
  #  library(gridExtra)
  library(cowplot)
  internal <- TT_table
  internal$neglogFDR <- -log10(TT_table$FDR)
  internal$angle <- atan2(TT_table$logFC.GenotypeBb,TT_table$logFC.Genotypebb)
  internal$abslogBb <- abs(TT_table$logFC.GenotypeBb)
  internal$abslogbb <- abs(TT_table$logFC.Genotypebb)
  internal$length <- internal[,c("abslogBb","abslogbb")][cbind(seq_len(nrow(internal)), max.col(internal[,c("abslogBb","abslogbb")]))]
  d=data.frame(angle=c(atan2(1,2),atan2(1,1),atan2(2,1),atan2(1,0),atan2(1,-1),atan2(0,-1),atan2(-1,-2),atan2(-1,-1),atan2(-2,-1),atan2(-1,0),atan2(-1,1),atan2(0,1)), 
               category=c("bb>Bb>BB", "bb=Bb>BB", "Bb>bb>BB", "Bb>BB=bb", "Bb>BB>bb", "Bb=BB>bb","BB>Bb>bb","BB>Bb=bb","BB>bb>Bb","BB=bb>Bb","bb>BB>Bb","bb>Bb=BB"))
  gg_scatter <- ggplot() +
    geom_point(data=internal, aes(x=angle, y=neglogFDR),shape=1) + 
    theme_bw() +
    theme(axis.text.x=element_blank()) + 
    coord_polar(theta="x",clip="off") +
    scale_x_continuous(breaks = seq(-pi, pi, pi/3), limits = c(-pi, pi)) +
    scale_y_continuous(breaks = seq(0,radial_lim,5),limits = c(0,radial_lim)) +
    geom_vline(xintercept = c(atan2(1,2),atan2(1,1),atan2(2,1),atan2(1,0),atan2(1,-1),atan2(0,-1),atan2(-1,-2),atan2(-1,-1),atan2(-2,-1),atan2(-1,0),atan2(-1,1),atan2(0,1))) +
    geom_histogram(d=internal,aes(x=angle,y=10*..ncount..),fill="green", alpha=0.3,bins=60) + 
    geom_text(data=d, mapping=aes(x=angle, y=radial_lim, label=category), size=4, angle=0, vjust=0, hjust=0.5)  
  return(gg_scatter)
} ## Make the Polar coordinate Dominance Plot

make_volcano <- function(et,Title) {
  library(ggplot2)
  resFilt <- topTags(et, n=nrow(et$table))
  
  volcanoData <- cbind(resFilt$table$logFC, -log10(resFilt$table$FDR))
  colnames(volcanoData) <- c("logFC", "negLogPval")
  volcanoData <- data.frame(volcanoData)
  volcanoData$Sig <- volcanoData$negLogPval > log10(0.01)*(-1)
  x_bounds = round(max(abs(volcanoData$logFC)))
  y_bound = round(volcanoData$negLogPval)
  # first set up the plot
  volcPlot <- ggplot(volcanoData,aes(logFC,negLogPval))
  # then add the points
  volcPlot <- volcPlot + geom_point(aes(colour = Sig),alpha=0.4, size=2)
  #  volcPlot <- volcPlot + geom_hline(yintercept = log10(0.01)*(-1),size=1) + geom_vline(xintercept = -1,size=1) + geom_vline(xintercept = 1,size=1) + geom_vline(xintercept = 0,size=1) + 
  volcPlot <- volcPlot + geom_hline(yintercept = log10(0.01)*(-1),size=1) + geom_vline(xintercept = 0,size=1) + 
    labs(x = "logFC", y = "-log10(P-value)",title=Title) + 
    theme(legend.position="none", axis.title.x = element_text(face="bold", size=16), axis.text.x  = element_text(size=12), axis.title.y = element_text(face="bold", size=16), axis.text.y  = element_text(size=12)) + 
    scale_x_continuous(limits=c((-1)*x_bounds,x_bounds),breaks=c((-1)*x_bounds,-1,0,1,x_bounds)) + 
    # scale_y_continuous(limits=c(0,y_bound)) + 
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black"),plot.title = element_text(hjust = 0.5))
  test <- topTags(et,p.value = 0.01,n=100000)
  length(test$table$logFC)
  LO <- sum(test$table$logFC < (-1))
  LI <- sum((test$table$logFC < 0)&(test$table$logFC >= (-1)))
  RI <- sum((test$table$logFC <= 1)&(test$table$logFC > 0))
  RO <- sum(test$table$logFC > 1)
  print(LO)
  print(LI)
  print(RI)
  print(RO)
  return(volcPlot)
  
} ## Create a standardized volcano plot for a given edgeR QLFtest output.

generate_GO_table <- function(data,Name) {
  QLF_TT_all <- topTags(data,p.value = 1,n=Inf)$table
  QLF_TT_sig <- topTags(data,p.value = 0.01,n=Inf)$table
  
  ## Load in custom annotation
  sinv_GO <- readMappings("~/Desktop/Sinv_Analyses/NCBI_genome/NCBI_lg_gene.annot.map", sep = "\t", IDsep=",")
  
  ## Load Specific Gene List 
  myInterestingGenes <- row.names(QLF_TT_sig)
  
  ## Load Full Gene Set
  geneNames <- row.names(QLF_TT_all)
  
  ## Create geneList Structure 
  geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
  names(geneList) <- geneNames
  
  ## Prep for file output
  output_file = Name
  
  ### Run for BP
  
  ## Laod the data into a topGO data object (BP,MF,CC)
  GOData <- new("topGOdata",ontology="BP",allGenes=geneList,description="GO analysis of DEGs with q<0.01", annot = annFUN.gene2GO, gene2GO = sinv_GO)
  
  ## Compute GO term enrichment
  resultClassic <- runTest(GOData, algorithm="classic", statistic="fisher")
  resultElim <- runTest(GOData, algorithm="elim", statistic="fisher")
  resultTopgo <- runTest(GOData, algorithm="weight01", statistic="fisher")
  
  allGO = usedGO(object = GOData) 
  
  allRes <- GenTable(GOData, classicFisher = resultClassic, elimFisher = resultElim, topgoFisher = resultTopgo, orderBy = "topgoFisher", ranksOf = "topgoFisher", topNodes = length(allGO))
  
  aa <- sapply(unname(sapply(allRes$GO.ID, function(x) {genes<-genesInTerm(GOData, x); genes[[1]][sapply(genes[[1]],function(id) id %in% myInterestingGenes)]})),paste0,collapse=",")
  allRes$genes <- aa
  
  res_print <- allRes[(allRes$topgoFisher<=0.05),c(1:5,8)]
  output_file4 = paste(output_file,"TopGO_BP_sig.pdf", sep="_")
  pdf(file = output_file4,width = 10)
  plot.new()
  grid.table(res_print,rows=NULL)
  dev.off()
  
  GOData <- new("topGOdata",ontology="MF",allGenes=geneList,description="GO analysis of DEGs with q<0.01", annot = annFUN.gene2GO, gene2GO = sinv_GO)
  
  ## Compute GO term enrichment
  resultClassic <- runTest(GOData, algorithm="classic", statistic="fisher")
  resultElim <- runTest(GOData, algorithm="elim", statistic="fisher")
  resultTopgo <- runTest(GOData, algorithm="weight01", statistic="fisher")
  
  allGO = usedGO(object = GOData) 
  
  allRes <- GenTable(GOData, classicFisher = resultClassic, elimFisher = resultElim, topgoFisher = resultTopgo, orderBy = "topgoFisher", ranksOf = "topgoFisher", topNodes = length(allGO))
  
  aa <- sapply(unname(sapply(allRes$GO.ID, function(x) {genes<-genesInTerm(GOData, x); genes[[1]][sapply(genes[[1]],function(id) id %in% myInterestingGenes)]})),paste0,collapse=",")
  allRes$genes <- aa
  
  res_print <- allRes[(allRes$topgoFisher<=0.05),c(1:5,8)]
  output_file4 = paste(output_file,"TopGO_MF_sig.pdf", sep="_")
  pdf(file = output_file4,width = 10)
  plot.new()
  grid.table(res_print,rows=NULL)
  dev.off()
  
  GOData <- new("topGOdata",ontology="CC",allGenes=geneList,description="GO analysis of DEGs with q<0.01", annot = annFUN.gene2GO, gene2GO = sinv_GO)
  
  ## Compute GO term enrichment
  resultClassic <- runTest(GOData, algorithm="classic", statistic="fisher")
  resultElim <- runTest(GOData, algorithm="elim", statistic="fisher")
  resultTopgo <- runTest(GOData, algorithm="weight01", statistic="fisher")
  
  allGO = usedGO(object = GOData) 
  
  allRes <- GenTable(GOData, classicFisher = resultClassic, elimFisher = resultElim, topgoFisher = resultTopgo, orderBy = "topgoFisher", ranksOf = "topgoFisher", topNodes = length(allGO))
  
  aa <- sapply(unname(sapply(allRes$GO.ID, function(x) {genes<-genesInTerm(GOData, x); genes[[1]][sapply(genes[[1]],function(id) id %in% myInterestingGenes)]})),paste0,collapse=",")
  allRes$genes <- aa
  
  res_print <- allRes[(allRes$topgoFisher<=0.05),c(1:5,8)]
  output_file4 = paste(output_file,"TopGO_CC_sig.pdf", sep="_")
  pdf(file = output_file4,width = 10)
  plot.new()
  grid.table(res_print,rows=NULL)
  dev.off()
  
  return()
} ## Compute and Print tables of GO term enrichment values

convert_LOC <- function(dataframe) {
  library(rtracklayer)
  library(rentrez)
  Si_gnG_Ensembl <- readGFF(file="~/Downloads/Solenopsis_invicta.Si_gnG.41.gff3")
  Si_gnG_refseq <- readGFF(file="/Users/samarsenault/Desktop/Sinv_Analyses/NCBI_genome/GCF_000188075.1_Si_gnG/GCF_000188075.1_Si_gnG_genomic.gff")
  refseq_translation <- data.frame(Si_gnG_refseq[(Si_gnG_refseq$type=="gene")|(Si_gnG_refseq$type=="pseudogene"),c("ID","Name")])
  Ensembl_translation <- data.frame(Si_gnG_Ensembl[(Si_gnG_Ensembl$type=="gene")|(Si_gnG_Ensembl$type=="pseudogene"),c("ID","description")])
  Ensembl_translation$ID <- gsub("gene:", "", Ensembl_translation$ID)
  merged_translation <- merge(x=refseq_translation,y=Ensembl_translation,by.x="Name",by.y="ID")
  g <- merge(x=dataframe,y=merged_translation,by.y="ID",by.x=0,all.x=TRUE)
  for (i in 1:nrow(g)){
    if (!is.na(g$Name[i])&is.na(g$description[i])){
      search_term <- gsub(pattern = "LOC",replacement = "",x = g$Name[i])
      g$description[i] <- entrez_summary(db="gene",id = search_term)[3]
    }
  }
  return(g)
} ## Add descriptions and proper gene names (LOC) to the data.frame

generate_heatmap_frame <- function(gene_names){
  library(karyoploteR)
  temp <- Sinv_annot[(row.names(Sinv_annot) %in% gene_names),]
  temp.sig<-temp[(grepl("lg",temp$chr)),]
  temp.sig <- temp.sig[complete.cases(temp.sig), ]
  DEG_GR <- makeGRangesFromDataFrame(temp.sig, seqnames.field = "chr",start.field = "start",end.field = "end",keep.extra.columns = TRUE)
  windows <- unlist(tileGenome(stats::setNames(kp$chromosome.lengths, kp$chromosomes),tilewidth = 2e6))
  dens <- countOverlaps(windows, DEG_GR)
  values(windows) <- data.frame(y = dens)
  return(windows)
} ## Takes a list of gene names and makes a heatmap object to pass to karyoploteR

generate_heatmap_frame_SNP <- function(SNP_data){
  library(karyoploteR)
  DEG_GR <- makeGRangesFromDataFrame(SNP_data, seqnames.field = "lg",start.field = "newstart",end.field = "newend",keep.extra.columns = TRUE)
  windows <- unlist(tileGenome(stats::setNames(kp$chromosome.lengths, kp$chromosomes),tilewidth = 500000))
  dens <- countOverlaps(windows, DEG_GR)
  values(windows) <- data.frame(y = dens)
  return(windows)
} ## Takes a list of SNPs and makes a heatmap object to pass to karyoploteR

check_supergene_presence_bias <- function(all_genes_TT_table){
  all_genes_TT_table_merge <- merge(all_genes_TT_table,Sinv_annot_lg,by.x=0,by.y="X4",all.x=TRUE)
  supergene_genes_int <- all_genes_TT_table_merge[((all_genes_TT_table_merge$X1=="lg16")&(all_genes_TT_table_merge$X2>=7311525)&(all_genes_TT_table_merge$X3<=18123987)),c("Row.names")]
  supergene_genes_int <- supergene_genes_int[!is.na(supergene_genes_int)]
  nonsuper_genes_int <- all_genes_TT_table_merge[((grepl(pattern = "lg",x = all_genes_TT_table_merge$X1))&(all_genes_TT_table_merge$Row.names %!in% supergene_genes_int)),c("Row.names")]
  
  in_super_sig <- all_genes_TT_table[((row.names(all_genes_TT_table) %in% supergene_genes_int)&(all_genes_TT_table$FDR<=0.01)),]
  in_super_not <- all_genes_TT_table[((row.names(all_genes_TT_table) %in% supergene_genes_int)&(all_genes_TT_table$FDR>0.01)),]
  out_super_sig <- all_genes_TT_table[((row.names(all_genes_TT_table) %in% nonsuper_genes_int)&(all_genes_TT_table$FDR<=0.01)),]
  out_super_not <- all_genes_TT_table[((row.names(all_genes_TT_table) %in% nonsuper_genes_int)&(all_genes_TT_table$FDR>0.01)),]
  out <- c(nrow(in_super_sig),nrow(in_super_not),nrow(out_super_sig),nrow(out_super_not),nrow(all_genes_TT_table))
  print(out[1:4])
  print(sum(out[1:4])-out[5])
  chi_out <- chisq.test(matrix(out[1:4],nrow=2),correct = FALSE)
  print(chi_out$observed)
  print(chi_out$expected)
  print(chi_out$p.value)
  print(fisher.test(x=matrix(out[1:4],nrow=2),alternative = "two.sided"))
  return()
} ## Takes a full TopTags file (all p-values) and checks for an overrepresentation of significant DEGs in the supergene as opposed to outside of it. 

check_supergene_dir_bias <- function(sig_genes_TT_table){
  sig_genes_TT_table_merge <- merge(sig_genes_TT_table,Sinv_annot_lg,by.x=0,by.y="X4",all.x=TRUE)
  supergene_genes_int <- sig_genes_TT_table_merge[((sig_genes_TT_table_merge$X1=="lg16")&(sig_genes_TT_table_merge$X2>=7311525)&(sig_genes_TT_table_merge$X3<=18123987)),c("Row.names")]
  supergene_genes_int <- supergene_genes_int[!is.na(supergene_genes_int)]
  nonsuper_genes_int <- sig_genes_TT_table_merge[((grepl(pattern = "lg",x = sig_genes_TT_table_merge$X1))&(sig_genes_TT_table_merge$Row.names %!in% supergene_genes_int)),c("Row.names")]
  in_super_up <- sig_genes_TT_table[((row.names(sig_genes_TT_table) %in% supergene_genes_int)&(sig_genes_TT_table$logFC>0)),]
  in_super_down <- sig_genes_TT_table[((row.names(sig_genes_TT_table) %in% supergene_genes_int)&(sig_genes_TT_table$logFC<0)),]
  out_super_up <- sig_genes_TT_table[((row.names(sig_genes_TT_table) %in% nonsuper_genes_int)&(sig_genes_TT_table$logFC>0)),]
  out_super_down <- sig_genes_TT_table[((row.names(sig_genes_TT_table) %in% nonsuper_genes_int)&(sig_genes_TT_table$logFC<0)),]
  out <- c(nrow(in_super_up),nrow(in_super_down),nrow(out_super_up),nrow(out_super_down),nrow(sig_genes_TT_table))
  print(out[1:4])
  print(sum(out[1:4])-out[5])
  chi_out <- chisq.test(matrix(out[1:4],nrow=2),correct = FALSE)
  print(chi_out$observed)
  print(chi_out$expected)
  print(chi_out$p.value)
  print(fisher.test(x=matrix(out[1:4],nrow=2),alternative = "two.sided"))
  return()
} ## Takes a table of significant DEGs and assesses whether or not there is a directional bias of differential expression within versus outside of the supergene. 

perform_runs_test_sig <- function(all_TT){
  add_pos <- merge(all_TT$table,Sinv_annot_lg,by.x=0,by.y="X4")
  supergene_bits <- add_pos[((add_pos$X1 == "lg16")&(add_pos$X2>=7311525)&(add_pos$X3<=18123987)),]
  supergene_order <- supergene_bits[order(supergene_bits[,'X2']),]
  supergene_order_sig <- supergene_order[(supergene_order$FDR<=0.01),]
  runs.test(supergene_order$FDR,plot=T,threshold = 0.01)
  #runs.test(supergene_order_sig$logFC,plot=T,threshold = 0)
}

perform_runs_test_dir <- function(all_TT){
  add_pos <- merge(all_TT$table,Sinv_annot_lg,by.x=0,by.y="X4")
  supergene_bits <- add_pos[((add_pos$X1 == "lg16")&(add_pos$X2>=7311525)&(add_pos$X3<=18123987)),]
  supergene_order <- supergene_bits[order(supergene_bits[,'X2']),]
  supergene_order_sig <- supergene_order[(supergene_order$FDR<=0.01),]
  #runs.test(supergene_order$FDR,plot=T,threshold = 0.01)
  runs.test(supergene_order_sig$logFC,plot=T,threshold = 0)
}

'%!in%' <- function(x,y)!('%in%'(x,y)) ## Quality of life function

General Information Load

## Load in the gene lengths computed from the annotation for RPKM calculation
len_file=read.delim(file="/Users/samarsenault/Desktop/Sinv_Analyses/Sinv_NCBI_counts/Genelength.tsv",sep="\t")
head(len_file)
##   GeneID Length
## 1  gene0   1839
## 2  gene1    894
## 3  gene2   2253
## 4  gene3    870
## 5  gene4     76
## 6  gene5    972
## Load in the counts file generated using the 2-pass method from STAR
x <- read.delim("/Users/samarsenault/Desktop/Sinv_Analyses/Sinv_NCBI_counts/NCBI_All_counts.tsv",row.names=1,sep="\t")
head(x)
##       lib_104AB lib_104AO lib_104DB lib_104GB lib_104GO lib_107AB lib_107AO
## gene0         0        27         0         9        27         0        24
## gene1         0         0         0         0         0         0         0
## gene2         1        16         0         0         7         0        24
## gene3     23142      2727     21787     22937      2385     10573      3340
## gene4         0         0         0         0         0         0         0
## gene5        14         4         0         8         4         4         3
##       lib_107EB lib_107EO lib_107GB lib_107GO lib_15AB lib_15AO lib_15DB
## gene0         0         9         0        10        0       10        0
## gene1         0         0         0         0        0        0        0
## gene2        29        10        88         8       41        4        5
## gene3     15382      4155     17307      4557     6960     7256     5436
## gene4         0         0         0         0        0        0        0
## gene5        15         1        20         0       12        0        2
##       lib_15DO lib_15GB lib_15GO lib_16CB lib_16CO lib_16DB lib_16DO lib_16GB
## gene0       20        0        0        0        0        0       21        0
## gene1        0        0        0        0        0        0        0        0
## gene2       14        0       16       27       25        0        9        0
## gene3     3270    15016     3305     1937     2446     7587     2199     6104
## gene4        0        0        0        0        0        0        0        0
## gene5        1        2        1       34        0       25        0        6
##       lib_16GO lib_19AB lib_19AO lib_19DB lib_19DO lib_19GB lib_19GO lib_1EB
## gene0        3        0        8        0        0        0        6       2
## gene1        0        0        0        0        0        0        0       0
## gene2       11       20        3       38        8        3       12       5
## gene3     3032    13182     3015    12381     2358    13280     7304     904
## gene4        0        0        0        0        0        0        0       0
## gene5        0       25        1       82        0       25        0      48
##       lib_1EO lib_207AB lib_207AO lib_209AB lib_209AO lib_20AB lib_20AO
## gene0      57        42         5         0         0        5       13
## gene1       0         0         0         0         0        0        0
## gene2      15        71         2        41         9       86       11
## gene3    1416     11555      3013     16333      4266     9281     3669
## gene4       0         0         0         0         0        0        0
## gene5       3        73         0        24         0       35        0
##       lib_20DB lib_20DO lib_20IB lib_20IO lib_222AB lib_222AO lib_232AB
## gene0        0        2        0       14         0         5         0
## gene1        0        0        0        0         0         0         0
## gene2       24       11        2       11        26         8        44
## gene3    24916     4310    17208     4228     31449      3370     10125
## gene4        0        0        0        0         0         0         0
## gene5       19        0       23        0        13         0         9
##       lib_232AO lib_233AB lib_233AO lib_235AB lib_235AO lib_239AB lib_239AO
## gene0         4         0         4         0         0         0         4
## gene1         0         0         0         0         0         0         0
## gene2        10         0         4        24         5         0         9
## gene3      3138     12777      3220     12547      5417     25319      4123
## gene4         0         0         0         0         0         0         0
## gene5         0        36         0        20         0         1         0
##       lib_240AB lib_240AO lib_30AB lib_30AO lib_30EB lib_30EO lib_30GB lib_30GO
## gene0         0         0        2       13        0        3        0        1
## gene1         0         0        0        0        0        0        0        0
## gene2         0         5      267        2        5        8       89        8
## gene3     17585      3988     5901     2905     7458     4653    11744     2184
## gene4         0         0        0        0        0        0        0        0
## gene5        15         0        6        0       11        0       12        0
##       lib_5AB lib_5AO lib_5GB lib_5GO
## gene0       0       9      10       8
## gene1       0       0       0       0
## gene2      44      11      15      10
## gene3    4405    3841    5980    2152
## gene4       0       0       0       0
## gene5      35       0      18       0
## Load in the model matrix containing information formatted for both pairiwse and GLM comparisons
adv_group <- read.delim("/Users/samarsenault/Desktop/Sinv_Analyses/Sinv_model_matrix5.tsv",row.names = 1,sep = "\t")
head(adv_group)
##       Individual_ID Colony_ID Tissue Social.Form Genotype            Alt_ID
## 104AB          104A       104  Brain    Polygyne       Bb Brain.Polygyne.Bb
## 104AO          104A       104  Ovary    Polygyne       Bb Ovary.Polygyne.Bb
## 104DB          104D       104  Brain    Polygyne       BB Brain.Polygyne.BB
## 104GB          104G       104  Brain    Polygyne       bb Brain.Polygyne.bb
## 104GO          104G       104  Ovary    Polygyne       bb Ovary.Polygyne.bb
## 107AB          107A       107  Brain    Polygyne       Bb Brain.Polygyne.Bb
## Load in the annotation information from a bed file
Sinv_annot <- read_delim("~/Desktop/Sinv_Analyses/NCBI_genome/Genetic_Mapping/NCBI_lg_gene_v3.bed", 
                         "\t", escape_double = FALSE, col_names = FALSE, 
                         trim_ws = TRUE)
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_character()
## )
row.names(Sinv_annot) <- Sinv_annot$X4 ## Rename the rows of the this variable with gene names
## Warning: Setting row names on a tibble is deprecated.
colnames(Sinv_annot) <- c("chr","start","end","gene") ## rename the columns with better description
head(Sinv_annot)
## # A tibble: 6 x 4
##   chr         start   end gene     
##   <chr>       <dbl> <dbl> <chr>    
## 1 NC_014672.1 10229 11174 gene15723
## 2 NC_014672.1 14342 15323 gene15724
## 3 NC_014672.1  4157  4507 gene15717
## 4 NC_014672.1  4929  6596 gene15718
## 5 NC_014672.1  8466  8996 gene15721
## 6 NC_014672.1  9021 10142 gene15722
## Load in the linkage mapped genome annotations
Sinv_annot_lg <- read_delim("~/Desktop/Sinv_Analyses/NCBI_genome/Genetic_Mapping/NCBI_lg_gene_v4.bed", 
                         "\t", escape_double = FALSE, col_names = FALSE, 
                         trim_ws = TRUE)
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_character()
## )

Pairwise Comparisons

y_pw <- DGEList(x,group = adv_group$Alt_ID)

y_pw$samples
##                       group lib.size norm.factors
## lib_104AB Brain.Polygyne.Bb 23839271            1
## lib_104AO Ovary.Polygyne.Bb 21048826            1
## lib_104DB Brain.Polygyne.BB 23454386            1
## lib_104GB Brain.Polygyne.bb 24389394            1
## lib_104GO Ovary.Polygyne.bb 20495668            1
## lib_107AB Brain.Polygyne.Bb 20141694            1
## lib_107AO Ovary.Polygyne.Bb 17197423            1
## lib_107EB Brain.Polygyne.BB 21829149            1
## lib_107EO Ovary.Polygyne.BB 19294902            1
## lib_107GB Brain.Polygyne.bb 25168093            1
## lib_107GO Ovary.Polygyne.bb 16074944            1
## lib_15AB  Brain.Polygyne.Bb 14960249            1
## lib_15AO  Ovary.Polygyne.Bb 16752710            1
## lib_15DB  Brain.Polygyne.BB 13227223            1
## lib_15DO  Ovary.Polygyne.BB 19369746            1
## lib_15GB  Brain.Polygyne.bb 15342322            1
## lib_15GO  Ovary.Polygyne.bb 15462774            1
## lib_16CB  Brain.Polygyne.Bb 10874244            1
## lib_16CO  Ovary.Polygyne.Bb 18167411            1
## lib_16DB  Brain.Polygyne.BB 18118617            1
## lib_16DO  Ovary.Polygyne.BB 15154985            1
## lib_16GB  Brain.Polygyne.bb 14252480            1
## lib_16GO  Ovary.Polygyne.bb 11697881            1
## lib_19AB  Brain.Polygyne.Bb 21454629            1
## lib_19AO  Ovary.Polygyne.Bb 14055961            1
## lib_19DB  Brain.Polygyne.BB 22242852            1
## lib_19DO  Ovary.Polygyne.BB 12250091            1
## lib_19GB  Brain.Polygyne.bb 22026200            1
## lib_19GO  Ovary.Polygyne.bb 18419244            1
## lib_1EB   Brain.Polygyne.BB  9465077            1
## lib_1EO   Ovary.Polygyne.BB 15489624            1
## lib_207AB Brain.Monogyne.BB 23545677            1
## lib_207AO Ovary.Monogyne.BB 13189348            1
## lib_209AB Brain.Monogyne.BB 25094494            1
## lib_209AO Ovary.Monogyne.BB 18649952            1
## lib_20AB  Brain.Polygyne.Bb 20937228            1
## lib_20AO  Ovary.Polygyne.Bb 19166278            1
## lib_20DB  Brain.Polygyne.BB 21474982            1
## lib_20DO  Ovary.Polygyne.BB 20286421            1
## lib_20IB  Brain.Polygyne.bb 23393220            1
## lib_20IO  Ovary.Polygyne.bb 19737048            1
## lib_222AB Brain.Monogyne.BB 26950035            1
## lib_222AO Ovary.Monogyne.BB 18315290            1
## lib_232AB Brain.Monogyne.BB 15462514            1
## lib_232AO Ovary.Monogyne.BB 17487369            1
## lib_233AB Brain.Monogyne.BB 18101509            1
## lib_233AO Ovary.Monogyne.BB 16506089            1
## lib_235AB Brain.Monogyne.BB 17354499            1
## lib_235AO Ovary.Monogyne.BB 22630544            1
## lib_239AB Brain.Monogyne.BB 23630798            1
## lib_239AO Ovary.Monogyne.BB 19564509            1
## lib_240AB Brain.Monogyne.BB 19079343            1
## lib_240AO Ovary.Monogyne.BB 18214783            1
## lib_30AB  Brain.Polygyne.Bb 18465236            1
## lib_30AO  Ovary.Polygyne.Bb 13595740            1
## lib_30EB  Brain.Polygyne.BB 13622038            1
## lib_30EO  Ovary.Polygyne.BB 18007543            1
## lib_30GB  Brain.Polygyne.bb 16763253            1
## lib_30GO  Ovary.Polygyne.bb 10097662            1
## lib_5AB   Brain.Polygyne.Bb 13352005            1
## lib_5AO   Ovary.Polygyne.Bb 11809254            1
## lib_5GB   Brain.Polygyne.bb 16318389            1
## lib_5GO   Ovary.Polygyne.bb 11915595            1
## Filter out low count genes
keep <- rowSums(cpm(y_pw)>10/9) >= 8 
len_file <- len_file[ as.vector(keep) , ]
y_pw <- y_pw[keep, , keep.lib.sizes=FALSE]

## Normalize samples by library size
y_pw <- calcNormFactors(y_pw)
Background.genes <- row.names(y_pw$counts)

## Calculate RPKM values
adv_group$FullID <- paste(adv_group$Individual_ID,adv_group$Alt_ID,sep = ".")

## Begin Computing Differential Expression
design_pw <- model.matrix(~0+adv_group$Alt_ID,data=y_pw$samples)
colnames(design_pw) <- levels(y_pw$samples$group)

y_pw <- estimateDisp(y_pw,design_pw)
y_pw <- estimateGLMCommonDisp(y_pw, design_pw)
y_pw <- estimateGLMTrendedDisp(y_pw, design_pw)
y_pw <- estimateGLMTagwiseDisp(y_pw, design_pw)

## QLM Fit and DE analysis
fit_pw <- glmQLFit(y_pw,design_pw)

my.contrasts <- makeContrasts(Brain_M_v_P_BB=Brain.Monogyne.BB-Brain.Polygyne.BB, 
                              Brain_BB_v_Bb=Brain.Polygyne.BB-Brain.Polygyne.Bb, 
                              Brain_Bb_v_bb=Brain.Polygyne.Bb-Brain.Polygyne.bb, 
                              Brain_mBB_v_pBb=Brain.Monogyne.BB-Brain.Polygyne.Bb, 
                              Brain_mBB_v_pbb=Brain.Monogyne.BB-Brain.Polygyne.bb, 
                              Ovary_M_v_P_BB=Ovary.Monogyne.BB-Ovary.Polygyne.BB, 
                              Ovary_BB_v_Bb=Ovary.Polygyne.BB-Ovary.Polygyne.Bb, 
                              Ovary_Bb_v_bb=Ovary.Polygyne.Bb-Ovary.Polygyne.bb, 
                              Ovary_mBB_v_pBb=Ovary.Monogyne.BB-Ovary.Polygyne.Bb, 
                              Ovary_mBB_v_pbb=Ovary.Monogyne.BB-Ovary.Polygyne.bb, 
                              Brain_v_Ovary_pBB=Brain.Polygyne.BB-Ovary.Polygyne.BB,
                              Brain_v_Ovary_mBB=Brain.Monogyne.BB-Ovary.Monogyne.BB,
                              Brain_v_Ovary_pBb=Brain.Polygyne.Bb-Ovary.Polygyne.Bb,
                              Brain_v_Ovary_pbb=Brain.Polygyne.bb-Ovary.Polygyne.bb,
                              Ovary_BB_v_bb=Ovary.Polygyne.BB-Ovary.Polygyne.bb, 
                              Brain_BB_v_bb=Brain.Polygyne.BB-Brain.Polygyne.bb, 
                              levels=design_pw)

pw.qlf.Brain_M_v_P_BB <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Brain_M_v_P_BB"])
pw.qlf.Brain_M_v_P_BB.TT <- topTags(pw.qlf.Brain_M_v_P_BB,p.value = 1,n=100000)
pw.qlf.Brain_M_v_P_BB.TT.sig <- topTags(pw.qlf.Brain_M_v_P_BB,p.value = 0.01,n=100000)$table
pw.qlf.Brain_M_v_P_BB.TT.names <- row.names(pw.qlf.Brain_M_v_P_BB.TT$table)

pw.qlf.Ovary_M_v_P_BB <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Ovary_M_v_P_BB"])
pw.qlf.Ovary_M_v_P_BB.TT <- topTags(pw.qlf.Ovary_M_v_P_BB,p.value = 1,n=100000)
pw.qlf.Ovary_M_v_P_BB.TT.sig <- topTags(pw.qlf.Ovary_M_v_P_BB,p.value = 0.01,n=100000)$table
pw.qlf.Ovary_M_v_P_BB.TT.names <- row.names(pw.qlf.Ovary_M_v_P_BB.TT$table)

pw.qlf.Brain_BB_v_Bb <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Brain_BB_v_Bb"])
pw.qlf.Brain_BB_v_Bb.TT <- topTags(pw.qlf.Brain_BB_v_Bb,p.value = 1,n=100000)
pw.qlf.Brain_BB_v_Bb.TT.sig <- topTags(pw.qlf.Brain_BB_v_Bb,p.value = 0.01,n=100000)$table
pw.qlf.Brain_BB_v_Bb.TT.names <- row.names(pw.qlf.Brain_BB_v_Bb.TT$table)

pw.qlf.Ovary_BB_v_Bb <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Ovary_BB_v_Bb"])
pw.qlf.Ovary_BB_v_Bb.TT <- topTags(pw.qlf.Ovary_BB_v_Bb,p.value = 1,n=100000)
pw.qlf.Ovary_BB_v_Bb.TT.sig <- topTags(pw.qlf.Ovary_BB_v_Bb,p.value = 0.01,n=100000)$table
pw.qlf.Ovary_BB_v_Bb.TT.names <- row.names(pw.qlf.Ovary_BB_v_Bb.TT$table)

pw.qlf.Brain_Bb_v_bb <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Brain_Bb_v_bb"])
pw.qlf.Brain_Bb_v_bb.TT <- topTags(pw.qlf.Brain_Bb_v_bb,p.value = 1,n=100000)
pw.qlf.Brain_Bb_v_bb.TT.sig <- topTags(pw.qlf.Brain_Bb_v_bb,p.value = 0.01,n=100000)$table
pw.qlf.Brain_Bb_v_bb.TT.names <- row.names(pw.qlf.Brain_Bb_v_bb.TT$table)

pw.qlf.Ovary_Bb_v_bb <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Ovary_Bb_v_bb"])
pw.qlf.Ovary_Bb_v_bb.TT <- topTags(pw.qlf.Ovary_Bb_v_bb,p.value = 1,n=100000)
pw.qlf.Ovary_Bb_v_bb.TT.sig <- topTags(pw.qlf.Ovary_Bb_v_bb,p.value = 0.01,n=100000)$table
pw.qlf.Ovary_Bb_v_bb.TT.names <- row.names(pw.qlf.Ovary_Bb_v_bb.TT$table)

pw.qlf.Brain_v_Ovary_pBB <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Brain_v_Ovary_pBB"])
pw.qlf.Brain_v_Ovary_pBB.TT <- topTags(pw.qlf.Brain_v_Ovary_pBB,p.value = 1,n=100000)
pw.qlf.Brain_v_Ovary_pBB.TT.sig <- topTags(pw.qlf.Brain_v_Ovary_pBB,p.value = 0.01,n=100000)$table
pw.qlf.Brain_v_Ovary_pBB.TT.names <- row.names(pw.qlf.Brain_v_Ovary_pBB.TT$table)

pw.qlf.Brain_v_Ovary_mBB <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Brain_v_Ovary_mBB"])
pw.qlf.Brain_v_Ovary_mBB.TT <- topTags(pw.qlf.Brain_v_Ovary_mBB,p.value = 1,n=100000)
pw.qlf.Brain_v_Ovary_mBB.TT.sig <- topTags(pw.qlf.Brain_v_Ovary_mBB,p.value = 0.01,n=100000)$table
pw.qlf.Brain_v_Ovary_mBB.TT.names <- row.names(pw.qlf.Brain_v_Ovary_mBB.TT$table)

pw.qlf.Brain_v_Ovary_pBb <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Brain_v_Ovary_pBb"])
pw.qlf.Brain_v_Ovary_pBb.TT <- topTags(pw.qlf.Brain_v_Ovary_pBb,p.value = 1,n=100000)
pw.qlf.Brain_v_Ovary_pBb.TT.sig <- topTags(pw.qlf.Brain_v_Ovary_pBb,p.value = 0.01,n=100000)$table
pw.qlf.Brain_v_Ovary_pBb.TT.names <- row.names(pw.qlf.Brain_v_Ovary_pBb.TT$table)

pw.qlf.Brain_v_Ovary_pbb <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Brain_v_Ovary_pbb"])
pw.qlf.Brain_v_Ovary_pbb.TT <- topTags(pw.qlf.Brain_v_Ovary_pbb,p.value = 1,n=100000)
pw.qlf.Brain_v_Ovary_pbb.TT.sig <- topTags(pw.qlf.Brain_v_Ovary_pbb,p.value = 0.01,n=100000)$table
pw.qlf.Brain_v_Ovary_pbb.TT.names <- row.names(pw.qlf.Brain_v_Ovary_pbb.TT$table)

pw.qlf.Brain_BB_v_bb <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Brain_BB_v_bb"])
pw.qlf.Brain_BB_v_bb.TT <- topTags(pw.qlf.Brain_BB_v_bb,p.value = 1,n=100000)
pw.qlf.Brain_BB_v_bb.TT.sig <- topTags(pw.qlf.Brain_BB_v_bb,p.value = 0.01,n=100000)$table
pw.qlf.Brain_BB_v_bb.TT.names <- row.names(pw.qlf.Brain_BB_v_bb.TT$table)

pw.qlf.Ovary_BB_v_bb <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Ovary_BB_v_bb"])
pw.qlf.Ovary_BB_v_bb.TT <- topTags(pw.qlf.Ovary_BB_v_bb,p.value = 1,n=100000)
pw.qlf.Ovary_BB_v_bb.TT.sig <- topTags(pw.qlf.Ovary_BB_v_bb,p.value = 0.01,n=100000)$table
pw.qlf.Ovary_BB_v_bb.TT.names <- row.names(pw.qlf.Ovary_BB_v_bb.TT$table)

pw.qlf.Ovary_mBB_v_pBb <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Ovary_mBB_v_pBb"])
pw.qlf.Ovary_mBB_v_pBb.TT <- topTags(pw.qlf.Ovary_mBB_v_pBb,p.value = 1,n=100000)
pw.qlf.Ovary_mBB_v_pBb.TT.sig <- topTags(pw.qlf.Ovary_mBB_v_pBb,p.value = 0.01,n=100000)$table
pw.qlf.Ovary_mBB_v_pBb.TT.names <- row.names(pw.qlf.Ovary_mBB_v_pBb.TT$table)

pw.qlf.Ovary_mBB_v_pbb <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Ovary_mBB_v_pbb"])
pw.qlf.Ovary_mBB_v_pbb.TT <- topTags(pw.qlf.Ovary_mBB_v_pbb,p.value = 1,n=100000)
pw.qlf.Ovary_mBB_v_pbb.TT.sig <- topTags(pw.qlf.Ovary_mBB_v_pbb,p.value = 0.01,n=100000)$table
pw.qlf.Ovary_mBB_v_pbb.TT.names <- row.names(pw.qlf.Ovary_mBB_v_pbb.TT$table)

pw.qlf.Brain_mBB_v_pBb <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Brain_mBB_v_pBb"])
pw.qlf.Brain_mBB_v_pBb.TT <- topTags(pw.qlf.Brain_mBB_v_pBb,p.value = 1,n=100000)
pw.qlf.Brain_mBB_v_pBb.TT.sig <- topTags(pw.qlf.Brain_mBB_v_pBb,p.value = 0.01,n=100000)$table
pw.qlf.Brain_mBB_v_pBb.TT.names <- row.names(pw.qlf.Brain_mBB_v_pBb.TT$table)

pw.qlf.Brain_mBB_v_pbb <- glmQLFTest(fit_pw, contrast=my.contrasts[,"Brain_mBB_v_pbb"])
pw.qlf.Brain_mBB_v_pbb.TT <- topTags(pw.qlf.Brain_mBB_v_pbb,p.value = 1,n=100000)
pw.qlf.Brain_mBB_v_pbb.TT.sig <- topTags(pw.qlf.Brain_mBB_v_pbb,p.value = 0.01,n=100000)$table
pw.qlf.Brain_mBB_v_pbb.TT.names <- row.names(pw.qlf.Brain_mBB_v_pbb.TT$table)


## Generate Volcano Plots for the Pairiwse Comparisons
# Pairwise, genotype-specific comparisons (Figure S1,A-D)
make_volcano(pw.qlf.Brain_BB_v_Bb,"Brain SB/SB vs. SB/Sb")
## [1] 36
## [1] 1
## [1] 0
## [1] 5

make_volcano(pw.qlf.Brain_BB_v_bb,"Brain SB/SB vs. Sb/Sb")
## [1] 68
## [1] 6
## [1] 3
## [1] 67
## Warning: Removed 3 rows containing missing values (geom_point).

make_volcano(pw.qlf.Ovary_BB_v_Bb,"Ovary SB/SB vs. SB/Sb")
## [1] 40
## [1] 2
## [1] 1
## [1] 0
## Warning: Removed 1 rows containing missing values (geom_point).

make_volcano(pw.qlf.Ovary_BB_v_bb,"Ovary SB/SB vs. Sb/Sb")
## [1] 179
## [1] 20
## [1] 13
## [1] 28
## Warning: Removed 1 rows containing missing values (geom_point).

# Pairwise, social form comparisons (Embedded within Figure 3B,C)
make_volcano(pw.qlf.Brain_M_v_P_BB,"Brain Monogyne SB/SB vs. Polygyne SB/SB")
## [1] 13
## [1] 3
## [1] 1
## [1] 5

make_volcano(pw.qlf.Ovary_M_v_P_BB,"Ovary Monogyne SB/SB vs. Polygyne SB/SB")
## [1] 0
## [1] 0
## [1] 0
## [1] 0

# Pairwise, combinatorial-effect comparisons (Embedded within Figure 3B,C)
make_volcano(pw.qlf.Brain_mBB_v_pBb,"Brain Monogyne SB/SB vs. Polygyne SB/Sb")
## [1] 30
## [1] 0
## [1] 2
## [1] 4
## Warning: Removed 1 rows containing missing values (geom_point).

make_volcano(pw.qlf.Ovary_mBB_v_pBb,"Ovary Monogyne SB/SB vs. Polygyne SB/Sb")
## [1] 175
## [1] 14
## [1] 9
## [1] 7
## Warning: Removed 1 rows containing missing values (geom_point).

## Generate UpSetR plot of Pairwise comparisons (Figure 2C)
PW_set <- list(Brain_pBB_pBb=row.names(pw.qlf.Brain_BB_v_Bb.TT.sig),
                 Brain_pBB_pbb=row.names(pw.qlf.Brain_BB_v_bb.TT.sig),
                 Ovary_pBB_pBb=row.names(pw.qlf.Ovary_BB_v_Bb.TT.sig),
                 Ovary_pBB_pbb=row.names(pw.qlf.Ovary_BB_v_bb.TT.sig)
                 )
upset(fromList(PW_set),nsets=4,order.by = "freq")

## Generate Social Form Euler Plots (Figure 3B,C)
Tissue_set <- list(
  Brain_mBBvpBB=row.names(pw.qlf.Brain_M_v_P_BB.TT.sig),
  Ovary_mBBvpBB=row.names(pw.qlf.Ovary_M_v_P_BB.TT.sig),
  Brain_pBBvpBb=row.names(pw.qlf.Brain_BB_v_Bb.TT.sig),
  Ovary_pBBvpBb=row.names(pw.qlf.Ovary_BB_v_Bb.TT.sig),
  Brain_mBBvpBb=row.names(pw.qlf.Brain_mBB_v_pBb.TT.sig),
  Ovary_mBBvpBb=row.names(pw.qlf.Ovary_mBB_v_pBb.TT.sig)
)

brain_euler <- plot(euler(Tissue_set[c(1,3,5)], shape = "ellipse"), quantities = TRUE)
brain_euler

ovary_euler <- plot(euler(Tissue_set[c(2,4,6)], shape = "ellipse"), quantities = TRUE)
## Warning in colSums(id & !empty) == 0 | merged_sets: longer object length is not
## a multiple of shorter object length
ovary_euler

Generalized Linear Model Analysis

## Format model matrix
Tissue <- factor(adv_group$Tissue)
Social.Form <- factor(adv_group$Social.Form)
Genotype_temp <- factor(adv_group$Genotype)
Genotype2 <- relevel(Genotype_temp, ref = "BB")

## Describe the Experiment Design and define Contrasts
glm_design <- model.matrix(~0 + Social.Form + Tissue * Genotype2, data = adv_group)

colnames(glm_design) <- c("Monogyne", "Polygyne", "TissueOvary", "Genotypebb", "GenotypeBb", 
    "Tissuebb", "TissueBb")

y_glm <- DGEList(x, samples = glm_design)

## Filter out low count genes
y_glm <- y_glm[(rowSums(cpm(y_glm) > 10/9) >= 8), , keep.lib.sizes = FALSE]  ## Based on the smallest number of aligned reads in our libraries as per edgeR documentation

### Normalize samples by library size
y_glm <- calcNormFactors(y_glm)

Background.genes <- row.names(y_glm$counts)

## Begin Computing Differential Expression
y_glm <- estimateDisp(y_glm, glm_design)
y_glm <- estimateGLMCommonDisp(y_glm, glm_design)
y_glm <- estimateGLMTrendedDisp(y_glm, glm_design)
y_glm <- estimateGLMTagwiseDisp(y_glm, glm_design)

## QLM Fit and DE analysis
fit_glm <- glmQLFit(y_glm, glm_design)

## Compute Differential expression based on number of supergene alleles DE
## Analysis for Paper
glm.QLF.BBvBb <- glmQLFTest(fit_glm, coef = "GenotypeBb")
glm.QLF.BBvBb_TT <- topTags(glm.QLF.BBvBb, n = Inf)
glm.QLF.BBvBb_TT_sig <- topTags(glm.QLF.BBvBb_TT, p.value = 0.01, n = Inf)$table

glm.QLF.BBvbb <- glmQLFTest(fit_glm, coef = "Genotypebb")
glm.QLF.BBvbb_TT <- topTags(glm.QLF.BBvbb, n = Inf)
glm.QLF.BBvbb_TT_sig <- topTags(glm.QLF.BBvbb_TT, p.value = 0.01, n = Inf)$table

glm.QLF.all <- glmQLFTest(fit_glm, coef = 4:5)
glm.QLF.all_TT <- topTags(glm.QLF.all, n = Inf)
glm.QLF.all_TT_sig <- topTags(glm.QLF.all_TT, p.value = 0.01, n = Inf)$table


## Compute differential expression based on social form
con.SF <- makeContrasts(Monogyne - Polygyne, levels = glm_design)
glm.QLF.SF <- glmQLFTest(fit_glm, contrast = con.SF)
glm.QLF.SF_TT <- topTags(glm.QLF.SF, n = Inf)
glm.QLF.SF_TT_sig <- topTags(glm.QLF.SF_TT, p.value = 0.01, n = Inf)$table

## Visualizations ## Note here that due to the way the contrasts are set up, the
## directionality of the volcano plots is flipped. This was rectified in the
## manuscript.  BB vs. Bb Volcano Plot (Figure 2A)
make_volcano(glm.QLF.BBvBb, "GLM BB vs. Bb")
## [1] 2
## [1] 1
## [1] 1
## [1] 46

# BB vs. bb Volcano Plot (Figure 2B)
make_volcano(glm.QLF.BBvbb, "GLM BB vs. bb")
## [1] 49
## [1] 3
## [1] 10
## [1] 91

# Monogyne vs Polygyne Volcano Plot (Figure 3A)
make_volcano(glm.QLF.SF, "GLM Monogyne vs. Polygyne")
## [1] 235
## [1] 61
## [1] 26
## [1] 8

GO Term Enrichment Analysis on pairwise and glm differential expression results

## GO Term Enrichment Analysis
generate_GO_table(pw.qlf.Brain_BB_v_Bb, "Brain_BBvBL")
## NULL
generate_GO_table(pw.qlf.Brain_BB_v_bb, "Brain_BBvLL")
## NULL
generate_GO_table(pw.qlf.Ovary_BB_v_Bb, "Ovary_BBvBL")
## NULL
generate_GO_table(pw.qlf.Ovary_BB_v_bb, "Ovary_BBvLL")
## NULL
generate_GO_table(glm.QLF.BBvBb, "GLM_BBvBL")
## NULL
generate_GO_table(glm.QLF.BBvbb, "GLM_BBvLL")
## NULL
generate_GO_table(glm.QLF.SF, "GLM_SF")
## NULL

Generate Genome-wide DE Figure

## Add Tags for Candidate Genes (These genes were derived from overlapping the various differential expression analyses)
load(file="~/Desktop/Sinv_Analyses/Candidate_gene_names.RData")
Candidate_Genes <- c(Naming_Frame_sub$Row.names,"gene1476")

temp.sig1 <- Sinv_annot_lg[((Sinv_annot_lg$X4 %in% Candidate_Genes)&(grepl("lg",Sinv_annot_lg$X1))),]
temp.sig1 <- temp.sig1[complete.cases(temp.sig1), ]
row.names(temp.sig1) <- temp.sig1$X4
## Warning: Setting row names on a tibble is deprecated.
temp.sig1 <- convert_LOC(temp.sig1)
## Appended gene descriptions that were not included due to an upstream error
temp.sig1[5,"description"] <- "dopamine receptor 1 "
temp.sig1[10,"description"] <- "pheromone-binding protein Gp-9-like"
temp.sig1[11,"description"] <- "NADH dehydrogenase [ubiquinone] 1 alpha subcomplex subunit 9, mitochondrial-like"
temp.sig1[14,"description"] <- "putative nuclease HARBI1"

DEG_GR1 <- makeGRangesFromDataFrame(temp.sig1, seqnames.field = "X1",start.field = "X2",end.field = "X3",keep.extra.columns = TRUE)
DEG_GR1$labels <- DEG_GR1$description

Chr_sizes_GR <- toGRanges("~/Desktop/Sinv_Analyses/lg_size.bed")
gene_bed_GR <- toGRanges("~/Desktop/Sinv_Analyses/NCBI_genome/Genetic_Mapping/NCBI_lgonly_gene_v4.bed")
supergene_GR <- GRanges(seqnames = "lg16",ranges=IRanges(7311525,18123987)) ## Need to check points!!!

pp <- getDefaultPlotParams(plot.type = 4)
pp$data1inmargin <- 5
pp$rightmargin <- 0.2
kp <- plotKaryotype(genome = Chr_sizes_GR, plot.type=4, ideogram.plotter = NULL, labels.plotter = NULL,plot.params = pp)
kpAddCytobandsAsLine(kp,lwd = 30)
kpAddChromosomeNames(kp, srt=90)
core_bb_windows <- generate_heatmap_frame(row.names(glm.QLF.BBvbb_TT_sig))
core_Bb_windows <- generate_heatmap_frame(row.names(glm.QLF.BBvBb_TT_sig))
Brain_Bb_windows <- generate_heatmap_frame(row.names(pw.qlf.Brain_BB_v_Bb.TT.sig))
Brain_bb_windows <- generate_heatmap_frame(row.names(pw.qlf.Brain_BB_v_bb.TT.sig))
Ovary_Bb_windows <- generate_heatmap_frame(row.names(pw.qlf.Ovary_BB_v_Bb.TT.sig))
Ovary_bb_windows <- generate_heatmap_frame(row.names(pw.qlf.Ovary_BB_v_bb.TT.sig))

## Add DEG density Heatmap
colfunc <- colorRampPalette(c("white", "blue"))
kpHeatmap(kp,data=core_bb_windows,r0=0.11,r1=0.15,colors = colfunc(5),ymin=0,ymax=20)
kpHeatmap(kp,data=core_Bb_windows,r0=0.16,r1=0.2,colors = colfunc(5),ymin=0,ymax=20)
kpHeatmap(kp,data=Brain_bb_windows,r0=0.21,r1=0.25,colors = colfunc(5),ymin=0,ymax=20)
kpHeatmap(kp,data=Brain_Bb_windows,r0=0.26,r1=0.3,colors = colfunc(5),ymin=0,ymax=20)
kpHeatmap(kp,data=Ovary_bb_windows,r0=0.31,r1=0.35,colors = colfunc(5),ymin=0,ymax=20)
kpHeatmap(kp,data=Ovary_Bb_windows,r0=0.36,r1=0.4,colors = colfunc(5),ymin=0,ymax=20)
## Add supergene Marker
kpPlotRegions(kp,data = supergene_GR,col=adjustcolor("darkgreen",alpha.f = 0.5), r0=0.05,r1=0.1)
kpPlotMarkers(kp, DEG_GR1,labels = DEG_GR1$description,r0 = 0.41,r1=0.5,ignore.chromosome.ends = TRUE,max.iter = 10000,label.dist = -0.001,clipping = TRUE,marker.parts = c(0.1,0.8,0.1))

## Determine if there is a statistical overrepresentation of DEGs within the supergene and if it is directional. 
## Check for overlap in supergene and DEGs for each pairwise comparison
check_supergene_presence_bias(glm.QLF.BBvBb_TT$table)
## [1]   17  354   17 8139
## [1] -2555
## Warning in chisq.test(matrix(out[1:4], nrow = 2), correct = FALSE): Chi-squared
## approximation may be incorrect
##      [,1] [,2]
## [1,]   17   17
## [2,]  354 8139
##            [,1]      [,2]
## [1,]   1.479301   32.5207
## [2,] 369.520699 8123.4793
## [1] 4.633262e-39
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(out[1:4], nrow = 2)
## p-value = 5.933e-15
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  10.91879 48.26582
## sample estimates:
## odds ratio 
##   22.96189
## NULL
check_supergene_presence_bias(glm.QLF.BBvbb_TT$table)
## [1]   46  325   50 8106
## [1] -2555
## Warning in chisq.test(matrix(out[1:4], nrow = 2), correct = FALSE): Chi-squared
## approximation may be incorrect
##      [,1] [,2]
## [1,]   46   50
## [2,]  325 8106
##           [,1]       [,2]
## [1,]   4.17685   91.82315
## [2,] 366.82315 8064.17685
## [1] 2.643213e-98
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(out[1:4], nrow = 2)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  14.77568 35.49537
## sample estimates:
## odds ratio 
##   22.91676
## NULL
check_supergene_presence_bias(pw.qlf.Brain_BB_v_Bb.TT$table)
## [1]   14  357   14 8142
## [1] -2555
## Warning in chisq.test(matrix(out[1:4], nrow = 2), correct = FALSE): Chi-squared
## approximation may be incorrect
##      [,1] [,2]
## [1,]   14   14
## [2,]  357 8142
##            [,1]       [,2]
## [1,]   1.218248   26.78175
## [2,] 369.781752 8129.21825
## [1] 1.902539e-32
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(out[1:4], nrow = 2)
## p-value = 1.576e-12
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   9.984269 51.955997
## sample estimates:
## odds ratio 
##   22.77735
## NULL
check_supergene_presence_bias(pw.qlf.Brain_BB_v_bb.TT$table)
## [1]   38  333   58 8098
## [1] -2555
## Warning in chisq.test(matrix(out[1:4], nrow = 2), correct = FALSE): Chi-squared
## approximation may be incorrect
##      [,1] [,2]
## [1,]   38   58
## [2,]  333 8098
##           [,1]       [,2]
## [1,]   4.17685   91.82315
## [2,] 366.82315 8064.17685
## [1] 6.043095e-65
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(out[1:4], nrow = 2)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  10.12979 24.77462
## sample estimates:
## odds ratio 
##   15.91616
## NULL
check_supergene_presence_bias(pw.qlf.Ovary_BB_v_Bb.TT$table)
## [1]    8  363   16 8140
## [1] -2555
## Warning in chisq.test(matrix(out[1:4], nrow = 2), correct = FALSE): Chi-squared
## approximation may be incorrect
##      [,1] [,2]
## [1,]    8   16
## [2,]  363 8140
##            [,1]       [,2]
## [1,]   1.044213   22.95579
## [2,] 369.955787 8133.04421
## [1] 3.172793e-12
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(out[1:4], nrow = 2)
## p-value = 4.744e-06
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   4.122905 27.963466
## sample estimates:
## odds ratio 
##   11.20021
## NULL
check_supergene_presence_bias(pw.qlf.Ovary_BB_v_bb.TT$table)
## [1]   40  331  117 8039
## [1] -2555
##      [,1] [,2]
## [1,]   40  117
## [2,]  331 8039
##           [,1]      [,2]
## [1,]   6.83089  150.1691
## [2,] 364.16911 8005.8309
## [1] 3.400547e-39
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(out[1:4], nrow = 2)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   5.549275 12.199496
## sample estimates:
## odds ratio 
##   8.298817
## NULL
## Generate adjusted logFC versions of the results tables so pw and glm tables are comparable. 
glm.QLF.BBvBb_adj <- glm.QLF.BBvBb_TT_sig
glm.QLF.BBvBb_adj$logFC <- -1*glm.QLF.BBvBb_TT_sig$logFC
glm.QLF.BBvbb_adj <- glm.QLF.BBvbb_TT_sig
glm.QLF.BBvbb_adj$logFC <- -1*glm.QLF.BBvbb_TT_sig$logFC

check_supergene_dir_bias(glm.QLF.BBvBb_adj)
## [1]  1 16  2 15
## [1] -16
## Warning in chisq.test(matrix(out[1:4], nrow = 2), correct = FALSE): Chi-squared
## approximation may be incorrect
##      [,1] [,2]
## [1,]    1    2
## [2,]   16   15
##      [,1] [,2]
## [1,]  1.5  1.5
## [2,] 15.5 15.5
## [1] 0.5454172
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(out[1:4], nrow = 2)
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   0.007491873 10.104993853
## sample estimates:
## odds ratio 
##  0.4790176
## NULL
check_supergene_dir_bias(glm.QLF.BBvbb_adj) 
## [1] 18 28 22 28
## [1] -57
##      [,1] [,2]
## [1,]   18   22
## [2,]   28   28
##          [,1]     [,2]
## [1,] 19.16667 20.83333
## [2,] 26.83333 29.16667
## [1] 0.6287651
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(out[1:4], nrow = 2)
## p-value = 0.6818
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.3345486 1.9934956
## sample estimates:
## odds ratio 
##  0.8198997
## NULL
check_supergene_dir_bias(pw.qlf.Brain_BB_v_Bb.TT.sig)
## [1]  1 13  4 10
## [1] -14
## Warning in chisq.test(matrix(out[1:4], nrow = 2), correct = FALSE): Chi-squared
## approximation may be incorrect
##      [,1] [,2]
## [1,]    1    4
## [2,]   13   10
##      [,1] [,2]
## [1,]  2.5  2.5
## [2,] 11.5 11.5
## [1] 0.1387917
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(out[1:4], nrow = 2)
## p-value = 0.3259
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.003637868 2.484508917
## sample estimates:
## odds ratio 
##  0.2033741
## NULL
check_supergene_dir_bias(pw.qlf.Brain_BB_v_bb.TT.sig)
## [1] 17 21 38 20
## [1] -48
##      [,1] [,2]
## [1,]   17   38
## [2,]   21   20
##          [,1]     [,2]
## [1,] 21.77083 33.22917
## [2,] 16.22917 24.77083
## [1] 0.04412524
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(out[1:4], nrow = 2)
## p-value = 0.05811
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.1693863 1.0681989
## sample estimates:
## odds ratio 
##  0.4300187
## NULL
check_supergene_dir_bias(pw.qlf.Ovary_BB_v_Bb.TT.sig)
## [1]  1  7  0 16
## [1] -19
## Warning in chisq.test(matrix(out[1:4], nrow = 2), correct = FALSE): Chi-squared
## approximation may be incorrect
##      [,1] [,2]
## [1,]    1    0
## [2,]    7   16
##           [,1]       [,2]
## [1,] 0.3333333  0.6666667
## [2,] 7.6666667 15.3333333
## [1] 0.1485618
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(out[1:4], nrow = 2)
## p-value = 0.3333
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.05128188        Inf
## sample estimates:
## odds ratio 
##        Inf
## NULL
check_supergene_dir_bias(pw.qlf.Ovary_BB_v_bb.TT.sig) 
## [1]  19  21  17 100
## [1] -83
##      [,1] [,2]
## [1,]   19   17
## [2,]   21  100
##           [,1]     [,2]
## [1,]  9.171975 26.82803
## [2,] 30.828025 90.17197
## [1] 1.852027e-05
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(out[1:4], nrow = 2)
## p-value = 5.337e-05
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   2.192937 12.846506
## sample estimates:
## odds ratio 
##   5.250997
## NULL
## Take the results from above and visualize it as a stacked bar plot (Figure S2)
DE_dir_CT <- read.delim("~/Desktop/Sinv_Analyses/DE_dir_CT3.tsv")
DE_dir_CT$Comparison_f = factor(DE_dir_CT$Comparison, levels=c('Full BB vs. Bb','Brain BB vs. Bb','Ovary BB vs. Bb','Full BB vs. bb','Brain BB vs. bb','Ovary BB vs. bb'))
ggplot(data=DE_dir_CT, aes(x=Genome.Position, y=DEG_Ratio, fill=DE.Direction)) + 
  geom_bar(stat="identity") + 
  facet_grid(~Comparison_f) +   
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Allele-specific Expression Analysis

## Load SNP-level files in the individual GATK ASE Read Counter files (for all 16 Bb libraries)
setwd("~/Desktop/Sinv_Analyses/GATK_ASERC_mod/")
temp = list.files(pattern="*counts.csv")
temp2 <- sapply(strsplit(temp, split='_', fixed=TRUE), function(x) (x[1]))
myfiles = lapply(temp, read.delim)
# Note: Each File is saved as myfiles[[x]] as a data.frame

## merge first two columns into identifier column
for (i in 1:16){
  myfiles[[i]]$ID <- paste(myfiles[[i]]$contig,myfiles[[i]]$position,myfiles[[i]]$altAllele,sep = "~")
  colnames(myfiles[[i]]) <- paste(colnames(myfiles[[i]]), temp2[i], sep = ".")
  colnames(myfiles[[i]])[14] <- "ID"
}

## merge all data.frames by ID
merged <- Reduce(function(...) merge(..., by='ID', all.x=FALSE), myfiles)

## Load gene-level files (made using bedtools intersect) in the individual GATK ASE Read Counter files (for all 16 Bb libraries)
temp_gene = list.files(pattern="*genes.tsv")
temp2_gene <- sapply(strsplit(temp_gene, split='_', fixed=TRUE), function(x) (x[1]))
myfiles_gene = lapply(temp_gene, read.delim)
# Note: Each File is saved as myfiles[[x]] as a data.frame

## merge first two columns into identifier column
for (i in 1:16){
  colnames(myfiles_gene[[i]]) <- paste(colnames(myfiles_gene[[i]]), temp2_gene[i], sep = ".")
  colnames(myfiles_gene[[i]])[1] <- "gene"
}

## merge all data.frames by ID
merged_genes <- Reduce(function(...) merge(..., by='gene', all.x=FALSE), myfiles_gene)
pull_cols_genes <- grepl("refCount|altCount",colnames(merged_genes))
row.names(merged_genes) <- merged_genes$gene
x_genes <- merged_genes[,pull_cols_genes]

## Extract the specific columns that I want (refCount and altCount)
pull_cols <- grepl("refCount|altCount",colnames(merged))
row.names(merged) <- merged$ID
x <- merged[,pull_cols]

## Load in model matrix (Modified to analyze ASE)
adv_group_ASE <- read.delim("/Users/samarsenault/Desktop/Sinv_Analyses/Sinv_model_matrix6.tsv",sep = "\t")
head(adv_group_ASE)
##   Sample Individual_ID Colony_ID Tissue Social.Form Genotype            Alt_ID
## 1  104AB          104A       104  Brain    Polygyne       Bb Brain.Polygyne.Bb
## 2  104AB          104A       104  Brain    Polygyne       Bb Brain.Polygyne.Bb
## 3  104AO          104A       104  Ovary    Polygyne       Bb Ovary.Polygyne.Bb
## 4  104AO          104A       104  Ovary    Polygyne       Bb Ovary.Polygyne.Bb
## 5  107AB          107A       107  Brain    Polygyne       Bb Brain.Polygyne.Bb
## 6  107AB          107A       107  Brain    Polygyne       Bb Brain.Polygyne.Bb
##        Allele
## 1   Reference
## 2 Alternative
## 3   Reference
## 4 Alternative
## 5   Reference
## 6 Alternative
Colony_ID <- factor(adv_group_ASE$Colony_ID)
Tissue <- factor(adv_group_ASE$Tissue)
Allele <- factor(adv_group_ASE$Allele)

## Describe the Experiment Design and define Contrasts
ASE_design <- model.matrix(~0+Tissue*Allele,data=adv_group_ASE)

y_ASE <- DGEList(x,samples=ASE_design)

y_ASE <- estimateDisp(y_ASE,ASE_design)
y_ASE <- estimateGLMCommonDisp(y_ASE, ASE_design)
y_ASE <- estimateGLMTrendedDisp(y_ASE, ASE_design)
y_ASE <- estimateGLMTagwiseDisp(y_ASE, ASE_design)

## QLM Fit and DE analysis
fit_ASE <- glmQLFit(y_ASE,ASE_design)

## DE Analysis
QLF.ASE <- glmQLFTest(fit_ASE,coef="AlleleReference") 
QLF.ASE_TT <- topTags(QLF.ASE,n=Inf)
QLF.ASE_TT_sig <- topTags(QLF.ASE_TT,p.value = 0.01,n=Inf)$table
QLF.ASE_TT_sig$SNPs <- row.names(QLF.ASE_TT_sig)

make_volcano(QLF.ASE,"SB vs. Sb in SB/Sb individuals")
## [1] 79
## [1] 14
## [1] 6
## [1] 61
## Warning: Removed 1 rows containing missing values (geom_point).

## Note: Positive values in volcano plot indicate higher expression in reference. 


## Describe the Experiment Design and define Contrasts
y_gene<- DGEList(x_genes,samples=ASE_design)

y_gene <- estimateDisp(y_gene,ASE_design)
y_gene <- estimateGLMCommonDisp(y_gene, ASE_design)
y_gene <- estimateGLMTrendedDisp(y_gene, ASE_design)
y_gene <- estimateGLMTagwiseDisp(y_gene, ASE_design)

## QLM Fit and DE analysis
fit_gene <- glmQLFit(y_gene,ASE_design)

## DE Analysis
QLF.ASE.gene <- glmQLFTest(fit_gene,coef="AlleleReference") 
QLF.ASE.gene_TT <- topTags(QLF.ASE.gene,n=Inf)
QLF.ASE.gene_TT_sig <- topTags(QLF.ASE.gene_TT,p.value = 0.01,n=Inf)$table
QLF.ASE.gene_TT_all <- topTags(QLF.ASE.gene_TT,p.value = 1,n=Inf)$table

## Generate ASE Gene Volcano plot (Figure 4A)
make_volcano(QLF.ASE.gene,"Alternatively Spliced Genes") 
## [1] 10
## [1] 6
## [1] 4
## [1] 9

## Note: Positive values in volcano plot indicate higher expression in reference. 

## Break the topTags file into a better data.frame
QLF.ASE_TT_df <- QLF.ASE_TT$table
QLF.ASE_TT_df$chr <- sapply(strsplit(row.names(QLF.ASE_TT_df), split='~', fixed=TRUE), function(x) (x[1]))
QLF.ASE_TT_df$start <- sapply(strsplit(row.names(QLF.ASE_TT_df), split='~', fixed=TRUE), function(x) (x[2]))
QLF.ASE_TT_df$end <- sapply(strsplit(row.names(QLF.ASE_TT_df), split='~', fixed=TRUE), function(x) (x[2]))
## merge with linkage map. 
genetic_map <- read.delim("~/Desktop/Sinv_Analyses/NCBI_genome/Genetic_Mapping/genetic_map.txt", comment.char="#")
head (genetic_map)
##         scaffold start     end orientation  lg lg_position partial
## 1 NW_011798938.1     1  955721   uncertain lg1           1   FALSE
## 2 NW_011794913.1     1 1181834           + lg1      955822   FALSE
## 3 NW_011795398.1     1 1293734           - lg1     2137756   FALSE
## 4 NW_011801858.1     1 1483336           - lg1     3431590   FALSE
## 5 NW_011798146.1     1  451599           - lg1     4915026   FALSE
## 6 NW_011800891.1     1 3265217           + lg1     5366725   FALSE
## Map the SNPs (based on gnG genome) to the linkage map. 
QLF.ASE_TT_df_merge <- merge(QLF.ASE_TT_df,genetic_map,by.x="chr",by.y="scaffold")
for (i in 1:nrow(QLF.ASE_TT_df_merge)){
  if (QLF.ASE_TT_df_merge$orientation[i]=="-"){
    QLF.ASE_TT_df_merge$newstart[i] <- QLF.ASE_TT_df_merge$lg_position[i] + (QLF.ASE_TT_df_merge$end.y[i]-as.numeric(QLF.ASE_TT_df_merge$start.x[i]))
  } else {
    QLF.ASE_TT_df_merge$newstart[i] <- QLF.ASE_TT_df_merge$lg_position[i] + as.numeric(QLF.ASE_TT_df_merge$start.x[i]) - as.numeric(QLF.ASE_TT_df_merge$start.y[i])
  }
}
QLF.ASE_TT_df_merge$newend <- QLF.ASE_TT_df_merge$newstart
QLF.ASE_TT_df_merge <- QLF.ASE_TT_df_merge[(QLF.ASE_TT_df_merge$FDR<=0.01),]

## Add in LOC IDs for the ASE gene frames
QLF.ASE.gene_TT_sig_LOC <- convert_LOC(QLF.ASE.gene_TT_sig)
QLF.ASE.gene_TT_all_LOC <- convert_LOC(QLF.ASE.gene_TT_all)


## Make ASE DEG Upset Plot (Figure 4B)
##Filter Gene name lists down by which genes were analyzed in ASE and DE
ASE_background <- QLF.ASE.gene_TT_all_LOC$Row.names
Upset_backgroud <- intersect(Background.genes,ASE_background)
ASE_genes_b <- QLF.ASE.gene_TT_all_LOC[(QLF.ASE.gene_TT_all_LOC$FDR <= 0.01)&(QLF.ASE.gene_TT_all_LOC$logFC<0),c("Row.names")]
ASE_genes_B <- QLF.ASE.gene_TT_all_LOC[(QLF.ASE.gene_TT_all_LOC$FDR <= 0.01)&(QLF.ASE.gene_TT_all_LOC$logFC>0),c("Row.names")]
DE_genes_b <- row.names(glm.QLF.BBvBb_TT_sig[(glm.QLF.BBvBb_TT_sig$logFC>0),])
DE_genes_B <- row.names(glm.QLF.BBvBb_TT_sig[(glm.QLF.BBvBb_TT_sig$logFC<0),])

ASE_genes_b <- intersect(ASE_genes_b,Upset_backgroud)
ASE_genes_B <- intersect(ASE_genes_B,Upset_backgroud)
DE_genes_b <- intersect(DE_genes_b,Upset_backgroud)
DE_genes_B <- intersect(DE_genes_B,Upset_backgroud)

Overlap <- list(ASE_b=ASE_genes_b,
                ASE_B=ASE_genes_B,
                DE_SBSb=DE_genes_b,
                DE_SBSB=DE_genes_B
)
upset(fromList(Overlap),nsets=4,order.by = "freq",text.scale = 1.4)

## Extract overlap information

b_overlap <- ASE_genes_b[(ASE_genes_b %in% DE_genes_b)]
b_overlap_data <- QLF.ASE.gene_TT_all_LOC[(QLF.ASE.gene_TT_all_LOC$Row.names %in% b_overlap),]

B_overlap <- ASE_genes_B[ASE_genes_B %in% DE_genes_B]
QLF.ASE.gene_TT_all_LOC[(QLF.ASE.gene_TT_all_LOC$Row.names %in% B_overlap),]
##    Row.names    logFC   logCPM        F       PValue          FDR         Name
## 61 gene11836 1.152001 11.61619 47.65609 7.660374e-08 2.429433e-06 LOC105203076
##                                                     description
## 61 nucleolar protein 16 [Source:RefSeq mRNA;Acc:XM_011171831.1]
nonB_overlap <- ASE_genes_B[!(ASE_genes_B %in% DE_genes_B)]
dosage_comp_B <- QLF.ASE.gene_TT_all_LOC[(QLF.ASE.gene_TT_all_LOC$Row.names %in% nonB_overlap),]

nonb_overlap <- ASE_genes_b[!(ASE_genes_b %in% DE_genes_b)]
dosage_comp_b <- QLF.ASE.gene_TT_all_LOC[(QLF.ASE.gene_TT_all_LOC$Row.names %in% nonb_overlap),]

Visualize Supergene Localization

## Set some plotting parameters for the karyoploteR
plot.params <- getDefaultPlotParams(plot.type = 1)
plot.params$data1height=300
plot.params$data1outmargin = 20
plot.params$data1inmargin = 5
plot.params$ideogramheight = 10
col.over <- "#FFBD07AA" #Yellow
col.under <- "#00A6EDAA" #Blue
col.insig <- "#7F7F7F"

## Begin Creating the tracks for Figure 5
kp <- plotKaryotype(genome = Chr_sizes_GR, cytobands = gene_bed_GR,chromosomes = c("lg16"),plot.params = plot.params)
## Warning in ideogram.plotter(kp, ...): No 'gieStain' column found in cytobands.
## Using 'gpos50' (gray) for all of them
kpAddBaseNumbers(kp,tick.dist = 1000000,add.units = TRUE)

## Add Supergene
kpPlotRegions(kp,data = supergene_GR,col="darkgreen", r0=0,r1=0.01)

## Add in various inverions from Huang et al.  (from 0 to 0.1)
gr <- GRanges(seqnames = "lg16", strand = c("+", "+", "+"),
              ranges = IRanges(start = c(7948943,8793193,18097796), end = c(8793193,18097796,18098299)))
values(gr) <- DataFrame(labels = c("Inversion A","Inversion B","Inversion C"))
kpPlotRegions(kp,data = gr[1],col="purple", r0=0.02,r1=0.03)
kpPlotRegions(kp,data = gr[2],col="orange", r0=0.02,r1=0.03)
kpPlotRegions(kp,data = gr[3],col="red", r0=0.02,r1=0.03)

## Plot Gene density across Lg16
kpPlotDensity(kp, gene_bed_GR, window.size = 500000, col="#ddaacc",r0 = 0.04,r1 = 0.09)

## Core BBvbb
QLF.BBvbb_adj.merge2 <- merge(glm.QLF.BBvbb_TT_sig,Sinv_annot_lg,by.x=0,by.y="X4")
QLF.BBvbb_adj.merge2$logFC <- -1*QLF.BBvbb_adj.merge2$logFC
QLF.BBvbb_adj.merge <- QLF.BBvbb_adj.merge2[QLF.BBvbb_adj.merge2$X1=="lg16",]
DEG_GR <- makeGRangesFromDataFrame(QLF.BBvbb_adj.merge, seqnames.field = "X1",start.field = "X2",end.field = "X3",keep.extra.columns = TRUE)

fc.ymax=ceiling(max(abs(DEG_GR$logFC)))
fc.ymin=-fc.ymax
cex.val <- -log10(DEG_GR$FDR)/15 + 0.7
sign.col <- rep(col.insig, length(DEG_GR))
sign.col[(DEG_GR$logFC<0)&(DEG_GR$FDR<=0.01)] <- col.under
sign.col[(DEG_GR$logFC>0)&(DEG_GR$FDR<=0.01)] <- col.over
kpPoints(kp, data=DEG_GR, y=DEG_GR$logFC, ymax=fc.ymax, ymin=fc.ymin,cex=cex.val, col=sign.col,r0 = 0.14, r1 = 0.29)
kpAxis(kp, ymax=fc.ymax, ymin=fc.ymin,side = 2,cex=0.5,r0 = 0.14, r1 = 0.29)

DEG_density_B <- generate_heatmap_frame(QLF.BBvbb_adj.merge[QLF.BBvbb_adj.merge$logFC>0,]$Row.names)
DEG_density_b <- generate_heatmap_frame(QLF.BBvbb_adj.merge[QLF.BBvbb_adj.merge$logFC<0,]$Row.names)

colfunc <- colorRampPalette(c("lightcyan1", "red"))
kpHeatmap(kp,data=DEG_density_B,r0=0.3,r1=0.33,colors = colfunc(5))
kpHeatmap(kp,data=DEG_density_b,r0=0.1+0,r1=0.13,colors = colfunc(5))

## Core BBvBb
QLF.BBvBb_adj.merge2 <- merge(glm.QLF.BBvBb_TT_sig,Sinv_annot_lg,by.x=0,by.y="X4")
QLF.BBvBb_adj.merge <- QLF.BBvBb_adj.merge2[QLF.BBvBb_adj.merge2$X1=="lg16",]
DEG_GR <- makeGRangesFromDataFrame(QLF.BBvBb_adj.merge, seqnames.field = "X1",start.field = "X2",end.field = "X3",keep.extra.columns = TRUE)

fc.ymax=ceiling(max(abs(DEG_GR$logFC)))
fc.ymin=-fc.ymax
cex.val <- -log10(DEG_GR$FDR)/15 + 0.7
sign.col <- rep(col.insig, length(DEG_GR))
sign.col[(DEG_GR$logFC<0)&(DEG_GR$FDR<=0.01)] <- col.under
sign.col[(DEG_GR$logFC>0)&(DEG_GR$FDR<=0.01)] <- col.over
kpPoints(kp, data=DEG_GR, y=DEG_GR$logFC, ymax=fc.ymax, ymin=fc.ymin,cex=cex.val, col=sign.col,r0 = 0.39, r1 = 0.54)
kpAxis(kp, ymax=fc.ymax, ymin=fc.ymin,side = 2,cex=0.5,r0 = 0.38, r1 = 0.54)

DEG_density_B <- generate_heatmap_frame(QLF.BBvBb_adj.merge[QLF.BBvBb_adj.merge$logFC>0,]$Row.names)
DEG_density_b <- generate_heatmap_frame(QLF.BBvBb_adj.merge[QLF.BBvBb_adj.merge$logFC<0,]$Row.names)

colfunc <- colorRampPalette(c("lightcyan1", "red"))
kpHeatmap(kp,data=DEG_density_b,r0=0.35,r1=0.38,colors = colfunc(5))
kpHeatmap(kp,data=DEG_density_B,r0=0.55,r1=0.58,colors = colfunc(5))

## Add in ASE Heatmap and B vs. b dots 0.6-0.8

ASE_GR <- makeGRangesFromDataFrame(QLF.ASE_TT_df_merge, seqnames.field = "lg",start.field = "newstart",end.field = "newend",keep.extra.columns = TRUE)

fc.ymax=ceiling(max(abs(ASE_GR$logFC)))
fc.ymin=-fc.ymax
cex.val <- -log10(ASE_GR$FDR)/15 + 0.7
sign.col <- rep(col.insig, length(ASE_GR))
col.over2 <- rgb(125, 125, 0, max = 255, alpha = 100) #Yellow
col.under2 <- rgb(0, 0, 255, max = 255, alpha = 100) #Blue
sign.col[(ASE_GR$logFC<0)&(ASE_GR$FDR<=0.01)] <- col.under2
sign.col[(ASE_GR$logFC>0)&(ASE_GR$FDR<=0.01)] <- col.over2
kpPoints(kp, data=ASE_GR, y=ASE_GR$logFC, ymax=fc.ymax, ymin=fc.ymin,cex=cex.val, col=sign.col,r0 = 0.64, r1 = 0.79)
kpAxis(kp, ymax=fc.ymax, ymin=fc.ymin,side = 2,cex=0.5,r0 = 0.63, r1 = 0.79)

SNP_density_B <- generate_heatmap_frame_SNP(QLF.ASE_TT_df_merge[(QLF.ASE_TT_df_merge$logFC>0),])
SNP_density_b <- generate_heatmap_frame_SNP(QLF.ASE_TT_df_merge[(QLF.ASE_TT_df_merge$logFC<0),])

colfunc <- colorRampPalette(c("lightcyan1", "red"))
kpHeatmap(kp,data=SNP_density_B,r0=0.80,r1=0.83,colors = colfunc(5))
kpHeatmap(kp,data=SNP_density_b,r0=0.60,r1=0.63,colors = colfunc(5))

## It is worth mentioning here that a few of the genes that are differentially expressed are not actually counted in this location analysis because they are located on the cusp of the linkage mapping information.Consequently, it was decided to exclude them from this analysis as their position cannot be confidently decided.
## Perform Run's Test on DEGs, Testing for non-uniformity in significance
perform_runs_test_sig(glm.QLF.BBvBb_TT)

## 
##  Runs Test
## 
## data:  supergene_order$FDR
## statistic = -1.4708, runs = 31, n1 = 354, n2 = 17, n = 371, p-value =
## 0.1414
## alternative hypothesis: nonrandomness
perform_runs_test_sig(glm.QLF.BBvbb_TT)

## 
##  Runs Test
## 
## data:  supergene_order$FDR
## statistic = -0.14242, runs = 81, n1 = 325, n2 = 46, n = 371, p-value =
## 0.8868
## alternative hypothesis: nonrandomness
## Perform Run's Test on DEGs, Testing for non-uniformity in directionality of significant DEGs
perform_runs_test_dir(glm.QLF.BBvBb_TT)

## 
##  Runs Test
## 
## data:  supergene_order_sig$logFC
## statistic = 0.36515, runs = 3, n1 = 16, n2 = 1, n = 17, p-value = 0.715
## alternative hypothesis: nonrandomness
perform_runs_test_dir(glm.QLF.BBvbb_TT)

## 
##  Runs Test
## 
## data:  supergene_order_sig$logFC
## statistic = -1.5396, runs = 18, n1 = 28, n2 = 18, n = 46, p-value =
## 0.1237
## alternative hypothesis: nonrandomness
## Perform Run's Test on DEGs, Testing for non-uniformity in directionality of significant DEGs in specific tissues
perform_runs_test_dir(pw.qlf.Brain_BB_v_Bb.TT)

## 
##  Runs Test
## 
## data:  supergene_order_sig$logFC
## statistic = 0.40825, runs = 3, n1 = 1, n2 = 13, n = 14, p-value =
## 0.6831
## alternative hypothesis: nonrandomness
perform_runs_test_dir(pw.qlf.Ovary_BB_v_Bb.TT)

## 
##  Runs Test
## 
## data:  supergene_order_sig$logFC
## statistic = 0.57735, runs = 3, n1 = 1, n2 = 7, n = 8, p-value = 0.5637
## alternative hypothesis: nonrandomness
perform_runs_test_dir(pw.qlf.Brain_BB_v_bb.TT)

## 
##  Runs Test
## 
## data:  supergene_order_sig$logFC
## statistic = -1.2608, runs = 16, n1 = 17, n2 = 21, n = 38, p-value =
## 0.2074
## alternative hypothesis: nonrandomness
perform_runs_test_dir(pw.qlf.Ovary_BB_v_bb.TT)

## 
##  Runs Test
## 
## data:  supergene_order_sig$logFC
## statistic = -0.30513, runs = 20, n1 = 19, n2 = 21, n = 40, p-value =
## 0.7603
## alternative hypothesis: nonrandomness
## Perform Run's Test on ASE genes for significance and directionality
perform_runs_test_sig(QLF.ASE.gene_TT)

## 
##  Runs Test
## 
## data:  supergene_order$FDR
## statistic = 1.6682, runs = 57, n1 = 192, n2 = 29, n = 221, p-value =
## 0.09527
## alternative hypothesis: nonrandomness
perform_runs_test_dir(QLF.ASE.gene_TT)

## 
##  Runs Test
## 
## data:  supergene_order_sig$logFC
## statistic = -0.13188, runs = 15, n1 = 13, n2 = 16, n = 29, p-value =
## 0.8951
## alternative hypothesis: nonrandomness
## Runs ASE by SNP

ASE_logFC <- QLF.ASE_TT_df_merge[order(QLF.ASE_TT_df_merge$newend),c("logFC")]
runs.test(ASE_logFC,plot=T,threshold = 0)

## 
##  Runs Test
## 
## data:  ASE_logFC
## statistic = -5.5217, runs = 45, n1 = 67, n2 = 93, n = 160, p-value =
## 3.356e-08
## alternative hypothesis: nonrandomness

Generate Polar Plot

## Merge with linkage mapped annotation to determine supergene vs. nonsupergene genes
glm.QLF.all_TT_sig_merge <- merge(glm.QLF.all_TT_sig,Sinv_annot_lg,by.x=0,by.y="X4",all.x=TRUE)

supergene_genes <- glm.QLF.all_TT_sig_merge[((glm.QLF.all_TT_sig_merge$X1=="lg16")&(glm.QLF.all_TT_sig_merge$X2>=7311525)&(glm.QLF.all_TT_sig_merge$X3<=18123987)),c("Row.names")]
supergene_genes <- supergene_genes[!is.na(supergene_genes)]
nonsuper_genes <- glm.QLF.all_TT_sig_merge[((grepl(pattern = "lg",x = glm.QLF.all_TT_sig_merge$X1))&(glm.QLF.all_TT_sig_merge$Row.names %!in% supergene_genes)),c("Row.names")]

## Generate Polar Plot with only the genes in the supergene (Figure S3,B,C)
NCBI_supergene_polar <- make_polar_DEG(glm.QLF.all_TT_sig[(row.names(glm.QLF.all_TT_sig) %in% supergene_genes),],15)
NCBI_supergene_polar
## Warning: Removed 1 rows containing missing values (geom_point).

NCBI_nonsupergene_polar <- make_polar_DEG(glm.QLF.all_TT_sig[(row.names(glm.QLF.all_TT_sig) %in% nonsuper_genes),],10)
NCBI_nonsupergene_polar

## Extract the angles of expression using an arctangent and then compare the radial distributions using a watson's two test
nonsuper_angle <- atan2(glm.QLF.all_TT_sig[(row.names(glm.QLF.all_TT_sig) %in% nonsuper_genes),c("logFC.GenotypeBb")],glm.QLF.all_TT_sig[(row.names(glm.QLF.all_TT_sig) %in% nonsuper_genes),c("logFC.Genotypebb")])
supergene_angle <- atan2(glm.QLF.all_TT_sig[(row.names(glm.QLF.all_TT_sig) %in% supergene_genes),c("logFC.GenotypeBb")],glm.QLF.all_TT_sig[(row.names(glm.QLF.all_TT_sig) %in% supergene_genes),c("logFC.Genotypebb")])
watson.two(nonsuper_angle, supergene_angle, alpha=0, plot=TRUE)

## 
##        Watson's Two-Sample Test of Homogeneity 
##  
## Test Statistic: 0.2196 
## 0.01 < P-value < 0.05 
## 
## Generate Dominance angle data.frame
dominance_angle <- data.frame(gene=row.names(glm.QLF.all_TT$table),angle=atan2(glm.QLF.all_TT$table$logFC.GenotypeBb,glm.QLF.all_TT$table$logFC.Genotypebb))
row.names(dominance_angle) <- dominance_angle$gene

Merge the results data frames into a compendium sheet

## Extract the results from the necessary data.frames and rename columns
Brain_Bb <- pw.qlf.Brain_BB_v_Bb.TT$table[,c(1,2,5)]
colnames(Brain_Bb) <- c("Brain_Bb_logFC","Brain_Bb_logCPM","Brain_Bb_FDR")

Brain_bb <- pw.qlf.Brain_BB_v_bb.TT$table[,c(1,2,5)]
colnames(Brain_bb) <- c("Brain_bb_logFC","Brain_bb_logCPM","Brain_bb_FDR")

Ovary_Bb <- pw.qlf.Ovary_BB_v_Bb.TT$table[,c(1,2,5)]
colnames(Ovary_Bb) <- c("Ovary_Bb_logFC","Ovary_Bb_logCPM","Ovary_Bb_FDR")

Ovary_bb <- pw.qlf.Ovary_BB_v_bb.TT$table[,c(1,2,5)]
colnames(Ovary_bb) <- c("Ovary_bb_logFC","Ovary_bb_logCPM","Ovary_bb_FDR")

Multifactorial_Bb <- glm.QLF.BBvBb_TT$table[,c(1,2,5)]
colnames(Multifactorial_Bb) <- c("Multifactorial_Bb_logFC","Multifactorial_Bb_logCPM","Multifactorial_Bb_FDR")

Multifactorial_bb <- glm.QLF.BBvbb_TT$table[,c(1,2,5)]
colnames(Multifactorial_bb) <- c("Multifactorial_bb_logFC","Multifactorial_bb_logCPM","Multifactorial_bb_FDR")

ASE_gene <- QLF.ASE.gene_TT_all[,c(1,2,5)]
colnames(ASE_gene) <- c("ASE_logFC","ASE_logCPM","ASE_FDR")

Sinv_annot_formerge <- data.frame(Sinv_annot_lg)
row.names(Sinv_annot_formerge) <- Sinv_annot_formerge$X4
colnames(Sinv_annot_formerge) <- c("chr","start","end","gene")
Sinv_annot_formerge <- Sinv_annot_formerge[,1:3]

Si_gnG_refseq <- readGFF(file="/Users/samarsenault/Desktop/Sinv_Analyses/NCBI_genome/GCF_000188075.1_Si_gnG/GCF_000188075.1_Si_gnG_genomic.gff")
refseq_LOC <- data.frame(Si_gnG_refseq[(Si_gnG_refseq$type=="gene")|(Si_gnG_refseq$type=="pseudogene"),c("ID","Name")])
refseq_desc_old <- data.frame(Si_gnG_refseq[(Si_gnG_refseq$type=="mRNA"),c("gene","product")])
refseq_desc = refseq_desc_old[order(refseq_desc_old[,'gene']),]
refseq_desc = refseq_desc[!duplicated(refseq_desc$gene),]
## Remove Redundent rows (genes with multiple transcripts in the annotation)
refseq_merge <- merge(refseq_LOC,refseq_desc,by.x="Name",by.y="gene",all.x=TRUE)
refseq_merge <- distinct(refseq_merge)
row.names(refseq_merge) <- refseq_merge$ID
refseq_merge <- refseq_merge[,c(1,3)]
colnames(refseq_merge) <- c("LOC_ID","Description")

temp1 <- merge(Sinv_annot_formerge,refseq_merge,by=0,all=T)
temp1 <- merge(temp1,Brain_Bb,by.x="Row.names",by.y=0,all=T)
temp1 <- merge(temp1,Brain_bb,by.x="Row.names",by.y=0,all=T)
temp1 <- merge(temp1,Ovary_Bb,by.x="Row.names",by.y=0,all=T)
temp1 <- merge(temp1,Ovary_bb,by.x="Row.names",by.y=0,all=T)
temp1 <- merge(temp1,Multifactorial_Bb,by.x="Row.names",by.y=0,all=T)
temp1 <- merge(temp1,Multifactorial_bb,by.x="Row.names",by.y=0,all=T)
temp1 <- merge(temp1,ASE_gene,by.x="Row.names",by.y=0,all=T)
All_Data_merge <- merge(temp1,dominance_angle,by.x="Row.names",by.y=0,all=T)
write.table(All_Data_merge,file = "~/Desktop/Sinv_Analyses/All_Data_merge_new.tsv",quote = F,sep = "\t",row.names = F)

## Extract dosage compensation candidate genes
b_up_DosComp <- All_Data_merge[((All_Data_merge$Multifactorial_Bb_FDR>0.01)&(All_Data_merge$ASE_logFC<0)&(All_Data_merge$ASE_FDR<=0.01)&!is.na(All_Data_merge$ASE_FDR)),]
B_up_DosComp <- All_Data_merge[((All_Data_merge$Multifactorial_Bb_FDR>0.01)&(All_Data_merge$ASE_logFC>0)&(All_Data_merge$ASE_FDR<=0.01)&!is.na(All_Data_merge$ASE_FDR)),]

Odorant Binding Protein Analysis

## Import the OBP-aligned RNA-seq Data
x1_OBP <- read_delim("~/Desktop/Sinv_Analyses/NCBI_genome/OBP_counts/OBP.all", 
                  "\t", escape_double = FALSE, trim_ws = TRUE)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   GeneID = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 52 parsing failures.
## row col   expected     actual                                                     file
##   1  -- 64 columns 65 columns '~/Desktop/Sinv_Analyses/NCBI_genome/OBP_counts/OBP.all'
##   2  -- 64 columns 65 columns '~/Desktop/Sinv_Analyses/NCBI_genome/OBP_counts/OBP.all'
##   3  -- 64 columns 65 columns '~/Desktop/Sinv_Analyses/NCBI_genome/OBP_counts/OBP.all'
##   4  -- 64 columns 65 columns '~/Desktop/Sinv_Analyses/NCBI_genome/OBP_counts/OBP.all'
##   5  -- 64 columns 65 columns '~/Desktop/Sinv_Analyses/NCBI_genome/OBP_counts/OBP.all'
## ... ... .......... .......... ........................................................
## See problems(...) for more details.
x_OBP <- data.frame(x1_OBP[1:52,2:64],row.names = x1_OBP$GeneID[1:52])

## Generate data.frame for pairwise OBP analyses
y_OBP <- DGEList(x_OBP,group = adv_group$Alt_ID)

keep_OBP <- rowSums(cpm(y_OBP)>10/9) >= 8 ## NEED TO EDIT!!!
y_OBP <- y_OBP[keep_OBP, , keep.lib.sizes=FALSE]

## Normalize samples by library size
y_OBP <- calcNormFactors(y_OBP)
obp.Background.genes <- row.names(y_OBP$counts)


## Begin Computing Differential Expression
y_OBP <- estimateDisp(y_OBP,design_pw)
y_OBP <- estimateGLMCommonDisp(y_OBP, design_pw)
y_OBP <- estimateGLMTrendedDisp(y_OBP, design_pw)
y_OBP <- estimateGLMTagwiseDisp(y_OBP, design_pw)

## QLM Fit and DE analysis
## Note: For more conservative estimates, use glmQLFTest vs. glmLRT
fit_OBP <- glmQLFit(y_OBP,design_pw)



obp.qlf.Brain_BB_v_Bb <- glmQLFTest(fit_OBP, contrast=my.contrasts[,"Brain_BB_v_Bb"])
obp.qlf.Brain_BB_v_Bb.TT <- topTags(obp.qlf.Brain_BB_v_Bb,p.value = 1,n=100000)
obp.qlf.Brain_BB_v_Bb.TT.sig <- topTags(obp.qlf.Brain_BB_v_Bb,p.value = 0.01,n=100000)$table
obp.qlf.Brain_BB_v_Bb.TT.names <- row.names(obp.qlf.Brain_BB_v_Bb.TT$table)

obp.qlf.Ovary_BB_v_Bb <- glmQLFTest(fit_OBP, contrast=my.contrasts[,"Ovary_BB_v_Bb"])
obp.qlf.Ovary_BB_v_Bb.TT <- topTags(obp.qlf.Ovary_BB_v_Bb,p.value = 1,n=100000)
obp.qlf.Ovary_BB_v_Bb.TT.sig <- topTags(obp.qlf.Ovary_BB_v_Bb,p.value = 0.01,n=100000)$table
obp.qlf.Ovary_BB_v_Bb.TT.names <- row.names(obp.qlf.Ovary_BB_v_Bb.TT$table)

obp.qlf.Brain_BB_v_bb <- glmQLFTest(fit_OBP, contrast=my.contrasts[,"Brain_BB_v_bb"])
obp.qlf.Brain_BB_v_bb.TT <- topTags(obp.qlf.Brain_BB_v_bb,p.value = 1,n=100000)
obp.qlf.Brain_BB_v_bb.TT.sig <- topTags(obp.qlf.Brain_BB_v_bb,p.value = 0.01,n=100000)$table
obp.qlf.Brain_BB_v_bb.TT.names <- row.names(obp.qlf.Brain_BB_v_bb.TT$table)

obp.qlf.Ovary_BB_v_bb <- glmQLFTest(fit_OBP, contrast=my.contrasts[,"Ovary_BB_v_bb"])
obp.qlf.Ovary_BB_v_bb.TT <- topTags(obp.qlf.Ovary_BB_v_bb,p.value = 1,n=100000)
obp.qlf.Ovary_BB_v_bb.TT.sig <- topTags(obp.qlf.Ovary_BB_v_bb,p.value = 0.01,n=100000)$table
obp.qlf.Ovary_BB_v_bb.TT.names <- row.names(obp.qlf.Ovary_BB_v_bb.TT$table)


## Now the GLM version of the analysis
y_obp_glm <- DGEList(x_OBP,samples=glm_design)

## Filter out low count genes
y_obp_glm <- y_obp_glm[keep_OBP, , keep.lib.sizes=FALSE]

### Normalize samples by library size
y_obp_glm <- calcNormFactors(y_obp_glm)

## Begin Computing Differential Expression
y_obp_glm <- estimateDisp(y_obp_glm,glm_design)
y_obp_glm <- estimateGLMCommonDisp(y_obp_glm, glm_design)
y_obp_glm <- estimateGLMTrendedDisp(y_obp_glm, glm_design)
y_obp_glm <- estimateGLMTagwiseDisp(y_obp_glm, glm_design)

## QLM Fit and DE analysis
## Note: For more conservative estimates, use glmQLFTest vs. glmLRT
fit_obp_glm <- glmQLFit(y_obp_glm,glm_design)

## Compute Differential expression based on number of supergene alleles
## DE Analysis for Paper
obp.QLF.BBvBb <- glmQLFTest(fit_obp_glm,coef="GenotypeBb") 
obp.QLF.BBvBb_TT <- topTags(obp.QLF.BBvBb,n=Inf)
obp.QLF.BBvBb_TT_sig <- topTags(obp.QLF.BBvBb_TT,p.value = 0.01,n=Inf)$table

obp.QLF.BBvbb <- glmQLFTest(fit_obp_glm,coef="Genotypebb") 
obp.QLF.BBvbb_TT <- topTags(obp.QLF.BBvbb,n=Inf)
obp.QLF.BBvbb_TT_sig <- topTags(obp.QLF.BBvbb_TT,p.value = 0.01,n=Inf)$table

## Compute differential expression based on social form
obp.QLF.SF <- glmQLFTest(fit_obp_glm,contrast=makeContrasts(Monogyne - Polygyne, levels=glm_design))
obp.QLF.SF_TT <- topTags(obp.QLF.SF,n=Inf)
obp.QLF.SF_TT_sig <- topTags(obp.QLF.SF_TT,p.value = 0.01,n=Inf)$table


## Reorder the dataframes
obp.qlf.Brain_BB_v_Bb_order <- obp.qlf.Brain_BB_v_Bb.TT$table[ order(row.names(obp.qlf.Brain_BB_v_Bb.TT$table)), ]
obp.qlf.Ovary_BB_v_Bb_order <- obp.qlf.Ovary_BB_v_Bb.TT$table[ order(row.names(obp.qlf.Ovary_BB_v_Bb.TT$table)), ]
obp.qlf.Brain_BB_v_bb_order <- obp.qlf.Brain_BB_v_bb.TT$table[ order(row.names(obp.qlf.Brain_BB_v_bb.TT$table)), ]
obp.qlf.Ovary_BB_v_bb_order <- obp.qlf.Ovary_BB_v_bb.TT$table[ order(row.names(obp.qlf.Ovary_BB_v_bb.TT$table)), ]
obp.qlf.Core_BB_v_Bb_order <- obp.QLF.BBvBb_TT$table[ order(row.names(obp.QLF.BBvBb_TT$table)), ]
obp.qlf.Core_BB_v_bb_order <- obp.QLF.BBvbb_TT$table[ order(row.names(obp.QLF.BBvbb_TT$table)), ]
obp.qlf.SF_order <- obp.QLF.SF_TT$table[ order(row.names(obp.QLF.SF_TT$table)), ]


#### Generate FDR heatmap (Figure S4B)
DEG_heatmap <- data.frame(rep(x = 0, 27))
DEG_heatmap <- cbind(DEG_heatmap,obp.qlf.Brain_BB_v_Bb_order$FDR)
DEG_heatmap <- cbind(DEG_heatmap,obp.qlf.Ovary_BB_v_Bb_order$FDR)
DEG_heatmap <- cbind(DEG_heatmap,obp.qlf.Brain_BB_v_bb_order$FDR)
DEG_heatmap <- cbind(DEG_heatmap,obp.qlf.Ovary_BB_v_bb_order$FDR)
DEG_heatmap <- cbind(DEG_heatmap,obp.qlf.Core_BB_v_Bb_order$FDR)
DEG_heatmap <- cbind(DEG_heatmap,obp.qlf.Core_BB_v_bb_order$FDR)
DEG_heatmap <- cbind(DEG_heatmap,obp.qlf.SF_order$FDR)
row.names(DEG_heatmap) <- row.names(obp.qlf.Core_BB_v_bb_order)
DEG_heatmap <- DEG_heatmap[,2:8]
colnames(DEG_heatmap) <- c("Brain_BB_v_Bb","Ovary_BB_v_Bb","Brain_BB_v_bb","Ovary_BB_v_bb","Core_BB_v_Bb","Core_BB_v_bb","Core_SocialForm")

# create color palette
col.pal <- brewer.pal(9,"YlOrRd")
hm.parameters <- list(DEG_heatmap, 
                      color = col.pal,
                      scale = "none",
                      kmeans_k = NA,
                      show_rownames = T, show_colnames = T,breaks=c(0,0.001,0.01,0.05,0.1,0.2,0.5,1),display_numbers=TRUE,number_format="%.3f",
                      main = "Solenopsis invicta OBP DEG Significance",
                      clustering_method = "complete",
                      cluster_rows = FALSE, cluster_cols = FALSE,
                      clustering_distance_rows = "correlation", 
                      clustering_distance_cols = "correlation")
do.call("pheatmap", hm.parameters)

#### Generate LogFC heatmap (Figure S4A)
DEG_heatmap <- data.frame(rep(x = 0, 27))
DEG_heatmap <- cbind(DEG_heatmap,obp.qlf.Brain_BB_v_Bb_order$logFC)
DEG_heatmap <- cbind(DEG_heatmap,obp.qlf.Ovary_BB_v_Bb_order$logFC)
DEG_heatmap <- cbind(DEG_heatmap,obp.qlf.Brain_BB_v_bb_order$logFC)
DEG_heatmap <- cbind(DEG_heatmap,obp.qlf.Ovary_BB_v_bb_order$logFC)
DEG_heatmap <- cbind(DEG_heatmap,-1*obp.qlf.Core_BB_v_Bb_order$logFC)
DEG_heatmap <- cbind(DEG_heatmap,-1*obp.qlf.Core_BB_v_bb_order$logFC)
DEG_heatmap <- cbind(DEG_heatmap,obp.qlf.SF_order$logFC)
row.names(DEG_heatmap) <- row.names(obp.qlf.Core_BB_v_bb_order)
DEG_heatmap <- DEG_heatmap[,2:8]
colnames(DEG_heatmap) <- c("Brain_BB_v_Bb","Ovary_BB_v_Bb","Brain_BB_v_bb","Ovary_BB_v_bb","Core_BB_v_Bb","Core_BB_v_bb","Core_SocialForm")

# create color palette
col.pal <- brewer.pal(9,"RdBu")
hm.parameters <- list(DEG_heatmap, 
                      color = col.pal,
                      scale = "none",
                      kmeans_k = NA,
                      show_rownames = T, show_colnames = T,display_numbers=TRUE,number_format="%.3f",
                      main = "Solenopsis invicta OBP DEG logFC",
                      clustering_method = "complete",
                      cluster_rows = FALSE, cluster_cols = FALSE,
                      clustering_distance_rows = "correlation", 
                      clustering_distance_cols = "correlation")
do.call("pheatmap", hm.parameters)

## Here we perform some additional analyses to address reviewer comments

## Pairwise Bb vs. bb comparisons
## make new volcano plots
make_volcano(pw.qlf.Ovary_Bb_v_bb,"Ovary SB/Sb vs. Sb/Sb")
## [1] 0
## [1] 0
## [1] 0
## [1] 2

make_volcano(pw.qlf.Brain_Bb_v_bb,"Brain SB/Sb vs. Sb/Sb")
## [1] 0
## [1] 1
## [1] 0
## [1] 8
## Warning: Removed 1 rows containing missing values (geom_point).

# tissue-specific UpSet plots

Brain_Bb_bb <- temp1[temp1$Row.names %in% row.names(pw.qlf.Brain_Bb_v_bb.TT.sig),]
Brain_Bb_bb
##       Row.names            chr    start      end       LOC_ID
## 1756  gene11577           lg16 17403499 17404920 LOC105202812
## 2000  gene11797           lg16 10122809 10131987 LOC105203092
## 5281   gene1475           <NA>       NA       NA LOC105206526
## 6583    gene177           lg16 10894386 10899201 LOC105207492
## 7431   gene2532 NW_011795606.1    12450    15077 LOC105192930
## 7638   gene2719           lg16 12149625 12172468 LOC105193135
## 13926  gene8379           lg16  9144321  9146201 LOC105199321
## 13943  gene8394           lg16  9317199  9319117 LOC105199336
## 13954  gene8403           lg16  9399971  9406551 LOC105199343
##                                                                      Description
## 1756                                          pre-mRNA-splicing factor Slu7-like
## 2000                                                uncharacterized LOC105203092
## 5281                                                         dopamine receptor 1
## 6583                         uncharacterized LOC105207492, transcript variant X2
## 7431                                                                        <NA>
## 7638                         uncharacterized LOC105193135, transcript variant X1
## 13926                                    copper homeostasis protein cutC homolog
## 13943 polyadenylate-binding protein-interacting protein 2, transcript variant X1
## 13954                        uncharacterized LOC105199343, transcript variant X2
##       Brain_Bb_logFC Brain_Bb_logCPM Brain_Bb_FDR Brain_bb_logFC
## 1756       1.1619521       5.2088087 2.658828e-01       7.079971
## 2000       0.7540459       3.8182881 1.879356e-01       2.187071
## 5281       0.3992948       6.0856020 8.166661e-01       2.567231
## 6583       1.1214292       6.6827817 3.003548e-01       5.145919
## 7431       0.1540111       0.2181494 9.936302e-01       5.850681
## 7638      -1.4498957       5.6683670 2.938347e-09      -2.427515
## 13926      1.0939644       3.1372041 6.277966e-01       5.224093
## 13943      1.3048902       4.4368701 9.350473e-03       3.141716
## 13954      0.6340571       5.8291791 9.367098e-02       2.141782
##       Brain_bb_logCPM Brain_bb_FDR Ovary_Bb_logFC Ovary_Bb_logCPM Ovary_Bb_FDR
## 1756        5.2088087 1.321374e-14      0.8236580       5.2088087 0.8128688802
## 2000        3.8182881 1.263121e-08      0.5140757       3.8182881 0.7934655800
## 5281        6.0856020 6.245568e-09      0.1622900       6.0856020 1.0000000000
## 6583        6.6827817 1.782271e-11      1.0639291       6.6827817 0.4907562039
## 7431        0.2181494 9.578921e-04      0.6680116       0.2181494 1.0000000000
## 7638        5.6683670 1.820967e-17     -0.9305337       5.6683670 0.0003210017
## 13926       3.1372041 1.314212e-07      0.9249539       3.1372041 0.9934274078
## 13943       4.4368701 2.558356e-11      0.5737414       4.4368701 0.8049172978
## 13954       5.8291791 2.090814e-12      0.3874658       5.8291791 0.7635615783
##       Ovary_bb_logFC Ovary_bb_logCPM Ovary_bb_FDR Multifactorial_Bb_logFC
## 1756       3.6876028       5.2088087 7.648926e-08              -1.1524809
## 2000       1.2514333       3.8182881 1.513121e-03              -0.6278227
## 5281       1.3519623       6.0856020 4.270179e-03              -0.5431052
## 6583       2.0739424       6.6827817 1.354704e-03              -0.9569147
## 7431       1.5059072       0.2181494 4.742832e-01              -0.5057420
## 7638      -1.4064056       5.6683670 1.349339e-08               1.3821503
## 13926      3.6134826       3.1372041 8.623034e-05              -1.1489277
## 13943      1.7367393       4.4368701 6.828259e-05              -1.1488844
## 13954      0.7263317       5.8291791 1.436868e-02              -0.5882147
##       Multifactorial_Bb_logCPM Multifactorial_Bb_FDR Multifactorial_bb_logFC
## 1756                 5.2088078          2.350646e-01               -7.070458
## 2000                 3.8182934          3.434857e-01               -2.060850
## 5281                 6.0856036          6.506031e-01               -2.711042
## 6583                 6.6827812          4.304093e-01               -4.981407
## 7431                 0.2182094          9.554528e-01               -6.202680
## 7638                 5.6683667          6.315581e-10                2.359770
## 13926                3.1372027          5.677740e-01               -5.279122
## 13943                4.4368712          2.041881e-02               -2.985714
## 13954                5.8291794          1.057792e-01               -2.095939
##       Multifactorial_bb_logCPM Multifactorial_bb_FDR  ASE_logFC ASE_logCPM
## 1756                 5.2088078          4.574738e-15  7.5278988  10.579215
## 2000                 3.8182934          1.866400e-08  1.1521056  10.994312
## 5281                 6.0856036          5.897525e-10 -0.5645447   9.513952
## 6583                 6.6827812          1.538864e-11         NA         NA
## 7431                 0.2182094          3.582084e-04         NA         NA
## 7638                 5.6683667          2.238215e-19 -0.2689870  13.754954
## 13926                3.1372027          4.445164e-08         NA         NA
## 13943                4.4368712          4.375044e-11  1.8420198  10.398021
## 13954                5.8291794          5.773635e-13  1.8153050  10.936319
##            ASE_FDR
## 1756  2.196734e-07
## 2000  7.095488e-03
## 5281  5.854976e-01
## 6583            NA
## 7431            NA
## 7638  3.842147e-01
## 13926           NA
## 13943 4.505039e-04
## 13954 3.733954e-06
Brain_all_sg_comps <- list(Brain_BB_Bb=row.names(pw.qlf.Brain_BB_v_Bb.TT.sig),
                           Brain_BB_bb=row.names(pw.qlf.Brain_BB_v_bb.TT.sig),
                           Brain_Bb_bb=row.names(pw.qlf.Brain_Bb_v_bb.TT.sig))
upset(fromList(Brain_all_sg_comps),nsets=3,order.by = "freq",text.scale = 1.4)

Ovary_Bb_bb <- temp1[temp1$Row.names %in% row.names(pw.qlf.Ovary_Bb_v_bb.TT.sig),]
Ovary_Bb_bb
##      Row.names  chr    start      end       LOC_ID
## 1756 gene11577 lg16 17403499 17404920 LOC105202812
## 1763 gene11583 lg16 17477820 17479836 LOC105202816
##                             Description Brain_Bb_logFC Brain_Bb_logCPM
## 1756 pre-mRNA-splicing factor Slu7-like       1.161952        5.208809
## 1763                               <NA>       0.000000        6.608818
##      Brain_Bb_FDR Brain_bb_logFC Brain_bb_logCPM Brain_bb_FDR Ovary_Bb_logFC
## 1756    0.2658828       7.079971        5.208809 1.321374e-14       0.823658
## 1763    1.0000000      -1.033023        6.608818 7.615750e-01       1.784655
##      Ovary_Bb_logCPM Ovary_Bb_FDR Ovary_bb_logFC Ovary_bb_logCPM Ovary_bb_FDR
## 1756        5.208809    0.8128689       3.687603        5.208809 7.648926e-08
## 1763        6.608818    0.1662591       9.235454        6.608818 9.057535e-11
##      Multifactorial_Bb_logFC Multifactorial_Bb_logCPM Multifactorial_Bb_FDR
## 1756             -1.15248093                 5.208808             0.2350646
## 1763             -0.04708812                 6.608817             1.0000000
##      Multifactorial_bb_logFC Multifactorial_bb_logCPM Multifactorial_bb_FDR
## 1756               -7.070458                 5.208808          4.574738e-15
## 1763                0.985932                 6.608817          6.989363e-01
##      ASE_logFC ASE_logCPM      ASE_FDR
## 1756  7.527899   10.57921 2.196734e-07
## 1763        NA         NA           NA
Ovary_all_sg_comps <- list(Ovary_BB_Bb=row.names(pw.qlf.Ovary_BB_v_Bb.TT.sig),
                           Ovary_BB_bb=row.names(pw.qlf.Ovary_BB_v_bb.TT.sig),
                           Ovary_Bb_bb=row.names(pw.qlf.Ovary_Bb_v_bb.TT.sig))
upset(fromList(Ovary_all_sg_comps),nsets=3,order.by = "freq",text.scale = 1.4)

## GLM Bb vs. bb comparisons
con.Bb_bb <- makeContrasts(GenotypeBb - Genotypebb, levels=glm_design)
glm.QLF.Bb_bb <- glmQLFTest(fit_glm,contrast=con.Bb_bb)
glm.QLF.Bb_bb_TT <- topTags(glm.QLF.Bb_bb,n=Inf)
glm.QLF.Bb_bb_TT_sig <- topTags(glm.QLF.Bb_bb_TT,p.value = 0.01,n=Inf)$table

make_volcano(glm.QLF.Bb_bb,"GLM SB/Sb vs. Sb/Sb")
## [1] 0
## [1] 1
## [1] 0
## [1] 8
## Warning: Removed 1 rows containing missing values (geom_point).

Present Session Info

save.image(file = "Molecular_Ecology_Markdown_v2.RData")
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] dplyr_1.0.0          rentrez_1.2.2        rtracklayer_1.42.2  
##  [4] readxl_1.3.1         randtests_1.0        RColorBrewer_1.1-2  
##  [7] CircStats_0.2-6      boot_1.3-25          MASS_7.3-51.6       
## [10] pheatmap_1.0.12      eulerr_6.1.0         data.table_1.12.8   
## [13] topGO_2.34.0         SparseM_1.78         GO.db_3.7.0         
## [16] AnnotationDbi_1.44.0 Biobase_2.42.0       graph_1.60.0        
## [19] ggplot2_3.3.1        cowplot_1.0.0        karyoploteR_1.8.8   
## [22] regioneR_1.14.0      GenomicRanges_1.34.0 GenomeInfoDb_1.18.2 
## [25] IRanges_2.16.0       S4Vectors_0.20.1     BiocGenerics_0.28.0 
## [28] gridExtra_2.3        magick_2.3           readr_1.3.1         
## [31] UpSetR_1.4.0         edgeR_3.24.3         limma_3.38.3        
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.4-1            ellipsis_0.3.1             
##  [3] biovizBase_1.30.1           htmlTable_1.13.3           
##  [5] XVector_0.22.0              base64enc_0.1-3            
##  [7] dichromat_2.0-0             rstudioapi_0.11            
##  [9] farver_2.0.3                bit64_0.9-7                
## [11] fansi_0.4.1                 splines_3.5.1              
## [13] knitr_1.28                  polyclip_1.10-0            
## [15] jsonlite_1.6.1              Formula_1.2-3              
## [17] Rsamtools_1.34.1            cluster_2.1.0              
## [19] compiler_3.5.1              httr_1.4.1                 
## [21] backports_1.1.8             assertthat_0.2.1           
## [23] Matrix_1.2-18               lazyeval_0.2.2             
## [25] cli_2.0.2                   formatR_1.7                
## [27] acepack_1.4.1               htmltools_0.5.0            
## [29] prettyunits_1.1.1           tools_3.5.1                
## [31] gtable_0.3.0                glue_1.4.1                 
## [33] GenomeInfoDbData_1.2.0      Rcpp_1.0.4.6               
## [35] cellranger_1.1.0            vctrs_0.3.1                
## [37] Biostrings_2.50.2           polylabelr_0.2.0           
## [39] xfun_0.14                   stringr_1.4.0              
## [41] lifecycle_0.2.0             ensembldb_2.6.8            
## [43] XML_3.99-0.3                zlibbioc_1.28.0            
## [45] scales_1.1.1                BSgenome_1.50.0            
## [47] VariantAnnotation_1.28.13   hms_0.5.3                  
## [49] ProtGenerics_1.14.0         SummarizedExperiment_1.12.0
## [51] AnnotationFilter_1.6.0      yaml_2.2.1                 
## [53] curl_4.3                    memoise_1.1.0              
## [55] biomaRt_2.38.0              rpart_4.1-15               
## [57] latticeExtra_0.6-28         stringi_1.4.6              
## [59] RSQLite_2.2.0               checkmate_2.0.0            
## [61] GenomicFeatures_1.34.8      BiocParallel_1.16.6        
## [63] rlang_0.4.6                 pkgconfig_2.0.3            
## [65] matrixStats_0.56.0          bitops_1.0-6               
## [67] evaluate_0.14               lattice_0.20-41            
## [69] purrr_0.3.4                 labeling_0.3               
## [71] GenomicAlignments_1.18.1    htmlwidgets_1.5.1          
## [73] bit_1.1-15.2                tidyselect_1.1.0           
## [75] plyr_1.8.6                  magrittr_1.5               
## [77] R6_2.4.1                    generics_0.0.2             
## [79] Hmisc_4.4-0                 DelayedArray_0.8.0         
## [81] DBI_1.1.0                   pillar_1.4.4               
## [83] foreign_0.8-72              withr_2.2.0                
## [85] survival_3.2-3              RCurl_1.98-1.2             
## [87] nnet_7.3-14                 tibble_3.0.1               
## [89] crayon_1.3.4                utf8_1.1.4                 
## [91] rmarkdown_2.3               bamsignals_1.14.0          
## [93] progress_1.2.2              locfit_1.5-9.4             
## [95] grid_3.5.1                  blob_1.2.1                 
## [97] digest_0.6.25               bezier_1.1.2               
## [99] munsell_0.5.0